From 8762c7698cc380d33042fa9e0b8ee9704a842281 Mon Sep 17 00:00:00 2001
From: hyc
Date: Wed, 26 Apr 2017 01:25:33 +0100
Subject: Preliminary bionic/Android support

Mostly from Crystax NDK

diff --git a/config/libc/bionic.in b/config/libc/bionic.in
new file mode 100644
index 0000000..20697ad
--- /dev/null
+++ b/config/libc/bionic.in
@@ -0,0 +1,183 @@
+# bionic options
+
+## depends on ! WINDOWS && ! BARE_METAL
+## depends on ARCH_arm || ARCH_mips || ARCH_x86
+##
+## select LIBC_SUPPORT_THREADS_POSIX
+##
+## help Bionic is the Android C library. It is prebuilt.
+
+config THREADS
+    default "posix"
+
+config LIBC_BIONIC_CUSTOM
+    bool
+    prompt "Custom bionic"
+    help
+      The chosen bionic-libc version shall not be downloaded. Instead, use
+      a custom location to get the source.
+
+if LIBC_BIONIC_CUSTOM
+
+config LIBC_BIONIC_CUSTOM_LOCATION
+    string
+    prompt "Full path to custom bionic source"
+    help
+      Enter the path to the directory or tarball of your source for bionic.
+
+      If the path is a zip archive, it should extract to: <name>-<version>/
+      where the name is android-ndk, and the version is set
+      below in the custom version string.
+
+config LIBC_BIONIC_CUSTOM_VERSION
+    string
+    prompt "Custom BIONIC version"
+    help
+      Enter the version number for your custom bionic.
+
+config LIBC_VERSION
+    string
+    default LIBC_BIONIC_CUSTOM_VERSION
+
+endif # LIBC_BIONIC_CUSTOM
+
+if ! LIBC_BIONIC_CUSTOM
+
+choice
+    bool
+    prompt "bionic version"
+# Don't remove next line
+# CT_INSERT_VERSION_BELOW
+
+config LIBC_BIONIC_V_15beta1
+    bool
+    prompt "15beta1"
+
+config LIBC_BIONIC_V_14b
+    bool
+    prompt "14b"
+
+config LIBC_BIONIC_V_13b
+    bool
+    prompt "13b (OBSOLETE)"
+    depends on OBSOLETE
+
+config LIBC_BIONIC_V_12b
+    bool
+    prompt "12b (OBSOLETE)"
+    depends on OBSOLETE
+
+config LIBC_BIONIC_V_11c
+    bool
+    prompt "11c (OBSOLETE)"
+    depends on OBSOLETE
+
+config LIBC_BIONIC_V_10e
+    bool
+    prompt "10e (OBSOLETE)"
+    depends on OBSOLETE
+
+endchoice
+
+config LIBC_VERSION
+    string
+# Don't remove next line
+# CT_INSERT_VERSION_STRING_BELOW
+    default "r15-beta1" if LIBC_BIONIC_V_15beta1
+    default "r14b" if LIBC_BIONIC_V_14b
+    default "r13b" if LIBC_BIONIC_V_13b
+    default "r12b" if LIBC_BIONIC_V_12b
+    default "r11c" if LIBC_BIONIC_V_11c
+    default "r10e" if LIBC_BIONIC_V_10e
+
+endif # ! LIBC_BIONIC_CUSTOM
+
+choice
+    bool
+    prompt "Android API level"
+    help
+      The minimum API level for 64-bit support is 21.
+# Don't remove next line +# CT_INSERT_VERSION_BELOW + +config ANDROID_API_24 + bool + prompt "24" + +config ANDROID_API_23 + bool + prompt "23" + +config ANDROID_API_22 + bool + prompt "22" + +config ANDROID_API_21 + bool + prompt "21" + +config ANDROID_API_19 + bool + prompt "19" + depends on ARCH_32 + +config ANDROID_API_18 + bool + prompt "18" + depends on ARCH_32 + +config ANDROID_API_17 + bool + prompt "17" + depends on ARCH_32 + +config ANDROID_API_16 + bool + prompt "16" + depends on ARCH_32 + +config ANDROID_API_15 + bool + prompt "15" + depends on ARCH_32 + +config ANDROID_API_14 + bool + prompt "14" + depends on ARCH_32 + +config ANDROID_API_13 + bool + prompt "13" + depends on ARCH_32 + +config ANDROID_API_12 + bool + prompt "12" + depends on ARCH_32 + +config ANDROID_API_9 + bool + prompt "9" + depends on ARCH_32 + +endchoice + +config ANDROID_API + string +# Don't remove next line +# CT_INSERT_VERSION_STRING_BELOW + default "24" if ANDROID_API_24 + default "23" if ANDROID_API_23 + default "22" if ANDROID_API_22 + default "21" if ANDROID_API_21 + default "19" if ANDROID_API_19 + default "18" if ANDROID_API_18 + default "17" if ANDROID_API_17 + default "16" if ANDROID_API_16 + default "15" if ANDROID_API_15 + default "14" if ANDROID_API_14 + default "13" if ANDROID_API_13 + default "12" if ANDROID_API_12 + default "9" if ANDROID_API_9 + diff --git a/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch b/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch new file mode 100644 index 0000000..1ae1186 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch @@ -0,0 +1,17570 @@ +Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master + +Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with +Crystax-specific lib support omitted +diff --git a/gcc/config.gcc b/gcc/config.gcc +index f66e48c..4380657 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -374,6 +374,7 @@ i[34567]86-*-*) + rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h + adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h + avx512cdintrin.h avx512erintrin.h avx512pfintrin.h ++ arm_neon.h + shaintrin.h clflushoptintrin.h xsavecintrin.h + xsavesintrin.h avx512dqintrin.h avx512bwintrin.h + avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h +@@ -396,6 +397,7 @@ x86_64-*-*) + rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h + adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h + avx512cdintrin.h avx512erintrin.h avx512pfintrin.h ++ arm_neon.h + shaintrin.h clflushoptintrin.h xsavecintrin.h + xsavesintrin.h avx512dqintrin.h avx512bwintrin.h + avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h +@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) + TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` + ;; + aarch64*-*-linux*) +- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" ++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ extra_options="${extra_options} linux-android.opt" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + case $target in + aarch64_be-*) + tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" + ;; ++ aarch64*-*-linux-android*) ++ tm_file="${tm_file} aarch64/aarch64-linux-android.h" ++ ;; + esac + aarch64_multilibs="${with_multilib_list}" + if test "$aarch64_multilibs" = "default"; then +@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian. 
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" + extra_options="${extra_options} linux-android.opt" + case ${target} in ++ mips64*android*) ++ default_mips_arch=mips64r6 ++ default_mips_abi=64 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="${tmake_file} mips/t-linux-android64" ++ ;; ++ mips*android*) ++ default_mips_arch=mips32 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="$tmake_file mips/t-linux-android" ++ ;; + mipsisa32r6*) + default_mips_arch=mips32r6 + ;; +diff --git a/gcc/config.in b/gcc/config.in +index 115cb61..9339168 100644 +--- a/gcc/config.in ++++ b/gcc/config.in +@@ -2119,6 +2119,12 @@ + #endif + + ++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */ ++#ifndef USED_FOR_TARGET ++#undef USE_EH_FRAME_HDR_FOR_STATIC ++#endif ++ ++ + /* Define to 1 if the 'long long' type is wider than 'long' but still + efficiently supported by the host hardware. */ + #ifndef USED_FOR_TARGET +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +new file mode 100644 +index 0000000..cffd84d +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -0,0 +1,63 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef GCC_AARCH64_LINUX_ANDROID_H ++#define GCC_AARCH64_LINUX_ANDROID_H ++ ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ ++ } \ ++ while (0) ++ ++#undef LINK_SPEC ++#define LINK_SPEC \ ++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ ++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) ++ ++#undef CC1_SPEC ++#define CC1_SPEC \ ++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) ++ ++#undef LIB_SPEC ++#define LIB_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ ++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) ++ ++#undef STARTFILE_SPEC ++#define STARTFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) ++ ++#undef ENDFILE_SPEC ++#define ENDFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif ++ ++#endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5fcaa59..6864195 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -21,7 +21,14 @@ + #ifndef GCC_AARCH64_LINUX_H + #define GCC_AARCH64_LINUX_H + +-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifndef RUNTIME_ROOT_PREFIX ++#define RUNTIME_ROOT_PREFIX "" ++#endif ++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifdef BIONIC_DYNAMIC_LINKER ++#undef BIONIC_DYNAMIC_LINKER ++#endif ++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" + + #undef MUSL_DYNAMIC_LINKER + #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" +diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h +index 093c38b..54b3e0c 100644 +--- a/gcc/config/alpha/elf.h ++++ b/gcc/config/alpha/elf.h +@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; + I imagine that other systems will catch up. In the meantime, it + doesn't harm to make sure that the data exists to be used later. */ + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index 5974c65..71b2c7a 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) + memsize = MEM_SIZE (x); + + /* Only certain alignment specifiers are supported by the hardware. */ +- if (memsize == 32 && (align % 32) == 0) ++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC ++ honors stricter alignment of composite type in user code, it doesn't ++ observe the alignment of memory passed as an extra argument for function ++ returning large composite type. 
See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ ++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) + align_bits = 256; +- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) ++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) + align_bits = 128; + else if (memsize >= 8 && (align % 8) == 0) + align_bits = 64; +diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h +index ad123dd..97b059d 100644 +--- a/gcc/config/arm/arm.h ++++ b/gcc/config/arm/arm.h +@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes + + #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ ++ && !inline_thumb1_jump_table \ + && (optimize_size || flag_pic))) + + #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ +- (TARGET_THUMB1 \ ++ (TARGET_THUMB1 && !inline_thumb1_jump_table \ + ? (min >= 0 && max < 512 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 47171b9..eb22d11 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -8179,7 +8179,7 @@ + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label +- "TARGET_32BIT || optimize_size || flag_pic" ++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" + " + { + enum insn_code code; +diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt +index 0ebe017..7724538 100644 +--- a/gcc/config/arm/arm.opt ++++ b/gcc/config/arm/arm.opt +@@ -193,6 +193,10 @@ mthumb-interwork + Target Report Mask(INTERWORK) + Support calls between Thumb and ARM instruction sets. + ++minline-thumb1-jumptable ++Target Report Var(inline_thumb1_jump_table) ++Inline Thumb1 Jump table code ++ + mtls-dialect= + Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) + Specify thread local storage scheme. +diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h +index 77f3055..32158ed 100644 +--- a/gcc/config/arm/elf.h ++++ b/gcc/config/arm/elf.h +@@ -56,8 +56,7 @@ + #undef SUBSUBTARGET_EXTRA_SPECS + #define SUBSUBTARGET_EXTRA_SPECS + +-#ifndef ASM_SPEC +-#define ASM_SPEC "\ ++#define DEFAULT_ASM_SPEC "\ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ + %(asm_cpu_spec) \ +@@ -66,6 +65,9 @@ + %{mthumb-interwork:-mthumb-interwork} \ + %{mfloat-abi=*} %{mfpu=*} \ + %(subtarget_extra_asm_spec)" ++ ++#ifndef ASM_SPEC ++#define ASM_SPEC DEFAULT_ASM_SPEC + #endif + + /* The ARM uses @ are a comment character so we need to redefine +@@ -104,7 +106,8 @@ + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. 
*/ + #define JUMP_TABLES_IN_TEXT_SECTION \ +- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) ++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ ++ && (optimize_size || flag_pic))) + + #ifndef LINK_SPEC + #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h +index ace8481..ed7f4d0 100644 +--- a/gcc/config/arm/linux-eabi.h ++++ b/gcc/config/arm/linux-eabi.h +@@ -108,11 +108,16 @@ + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ +- ANDROID_CC1_SPEC) ++ ANDROID_CC1_SPEC("-fpic")) + + #define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + ++#undef ASM_SPEC ++#define ASM_SPEC \ ++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ ++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) ++ + #undef LIB_SPEC + #define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ +diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi +index 8f1307c..cbbec5b 100644 +--- a/gcc/config/arm/t-linux-androideabi ++++ b/gcc/config/arm/t-linux-androideabi +@@ -1,8 +1,9 @@ +-MULTILIB_OPTIONS = march=armv7-a mthumb +-MULTILIB_DIRNAMES = armv7-a thumb +-MULTILIB_EXCEPTIONS = ++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard ++MULTILIB_DIRNAMES = armv7-a thumb hard ++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* + MULTILIB_MATCHES = + MULTILIB_OSDIRNAMES = ++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch + + # The "special" multilib can be used to build native applications for Android, + # as opposed to native shared libraries that are then called via JNI. +diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h +index 5ded869..5f51ac8 100644 +--- a/gcc/config/freebsd.h ++++ b/gcc/config/freebsd.h +@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see + #define LIB_SPEC FBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #ifdef TARGET_LIBC_PROVIDES_SSP + #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ +diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h +index b0bf40a..d1874bc 100644 +--- a/gcc/config/gnu-user.h ++++ b/gcc/config/gnu-user.h +@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LINK_GCC_C_SEQUENCE_SPEC + #define LINK_GCC_C_SEQUENCE_SPEC \ +diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h +new file mode 100644 +index 0000000..10944f3 +--- /dev/null ++++ b/gcc/config/i386/arm_neon.h +@@ -0,0 +1,16622 @@ ++//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com ++ ++//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved. ++ ++//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. ++ ++//By downloading, copying, installing or using the software you agree to this license. ++//If you do not agree to this license, do not download, install, copy or use the software. 
++
++// License Agreement
++
++//Permission to use, copy, modify, and/or distribute this software for any
++//purpose with or without fee is hereby granted, provided that the above
++//copyright notice and this permission notice appear in all copies.
++
++//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
++//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
++//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
++//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
++//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
++//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
++//PERFORMANCE OF THIS SOFTWARE.
++
++//*****************************************************************************************
++// This file is intended to simplify ARM->IA32 porting
++// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
++// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
++// MMX instruction set is not used due to performance overhead and the necessity to use the
++// EMMS instruction (_mm_empty()) for mmx-x87 floating point switching
++//*****************************************************************************************
++
++//!!!!!!! To use this file in your project that uses ARM NEON intrinsics just keep arm_neon.h included and compile it as usual.
++//!!!!!!! Please pay attention to USE_SSE4 below - you need to define it for newest Intel platforms for
++//!!!!!!! greater performance. It can be done by the -msse4.2 compiler switch.
++
++#ifndef NEON2SSE_H
++#define NEON2SSE_H
++
++#ifndef USE_SSE4
++#if defined(__SSE4_2__)
++    #define USE_SSE4
++#endif
++#endif
++
++#include <xmmintrin.h>     //SSE
++#include <emmintrin.h>     //SSE2
++#include <pmmintrin.h>     //SSE3
++#include <tmmintrin.h>     //SSSE3
++#ifdef USE_SSE4
++#include <smmintrin.h>     //SSE4.1
++#include <nmmintrin.h>     //SSE4.2
++#endif
++
++
++//***************  functions and data attributes, compiler dependent  *********************************
++//***********************************************************************************
++#ifdef __GNUC__
++#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
++#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
++#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
++#if _GCC_VERSION < 40500
++    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
++#else
++    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
++#endif
++#if defined(__x86_64__)
++    #define _NEON2SSE_64BIT __x86_64__
++#endif
++#else
++#define _NEON2SSE_ALIGN_16 __declspec(align(16))
++#define _NEON2SSE_INLINE __inline
++#if defined(_MSC_VER) || defined (__INTEL_COMPILER)
++    #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
++#if defined(_M_X64)
++        #define _NEON2SSE_64BIT _M_X64
++#endif
++#else
++    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
++#endif
++#endif
++
++#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
++    #define _NEON2SSE_64BIT_SSE4
++#endif
++
++/*********************************************************************************************************************/
++//    data types conversion
++/*********************************************************************************************************************/
++#if
defined(_MSC_VER) && (_MSC_VER < 1300) ++ typedef signed char int8_t; ++ typedef unsigned char uint8_t; ++ typedef signed short int16_t; ++ typedef unsigned short uint16_t; ++ typedef signed int int32_t; ++ typedef unsigned int uint32_t; ++ typedef signed long long int64_t; ++ typedef unsigned long long uint64_t; ++#elif defined(_MSC_VER) ++ typedef signed __int8 int8_t; ++ typedef unsigned __int8 uint8_t; ++ typedef signed __int16 int16_t; ++ typedef unsigned __int16 uint16_t; ++ typedef signed __int32 int32_t; ++ typedef unsigned __int32 uint32_t; ++ ++ typedef signed long long int64_t; ++ typedef unsigned long long uint64_t; ++#else ++#include ++#include ++#endif ++ ++typedef union __m64_128 { ++ uint64_t m64_u64[1]; ++ float m64_f32[2]; ++ int8_t m64_i8[8]; ++ int16_t m64_i16[4]; ++ int32_t m64_i32[2]; ++ int64_t m64_i64[1]; ++ uint8_t m64_u8[8]; ++ uint16_t m64_u16[4]; ++ uint32_t m64_u32[2]; ++} __m64_128; ++ ++typedef __m64_128 int8x8_t; ++typedef __m64_128 uint8x8_t; ++typedef __m64_128 int16x4_t; ++typedef __m64_128 uint16x4_t; ++typedef __m64_128 int32x2_t; ++typedef __m64_128 uint32x2_t; ++typedef __m64_128 int64x1_t; ++typedef __m64_128 uint64x1_t; ++typedef __m64_128 poly8x8_t; ++typedef __m64_128 poly16x4_t; ++ ++typedef __m64_128 float32x2_t; ++typedef __m128 float32x4_t; ++ ++typedef __m128 float16x4_t; //not supported by IA, for compatibility ++typedef __m128 float16x8_t; //not supported by IA, for compatibility ++ ++typedef __m128i int8x16_t; ++typedef __m128i int16x8_t; ++typedef __m128i int32x4_t; ++typedef __m128i int64x2_t; ++typedef __m128i uint8x16_t; ++typedef __m128i uint16x8_t; ++typedef __m128i uint32x4_t; ++typedef __m128i uint64x2_t; ++typedef __m128i poly8x16_t; ++typedef __m128i poly16x8_t; ++ ++#if defined(_MSC_VER) ++ #define SINT_MIN (-2147483647 - 1) /* min signed int value */ ++ #define SINT_MAX 2147483647 /* max signed int value */ ++#else ++ #define SINT_MIN INT_MIN /* min signed int value */ ++ #define SINT_MAX INT_MAX /* max signed int value */ ++#endif ++ ++typedef float float32_t; ++#if !defined(__clang__) ++typedef float __fp16; ++#endif ++ ++typedef uint8_t poly8_t; ++typedef uint16_t poly16_t; ++ ++ ++//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in ++//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types ++struct int8x16x2_t { ++ int8x16_t val[2]; ++}; ++struct int16x8x2_t { ++ int16x8_t val[2]; ++}; ++struct int32x4x2_t { ++ int32x4_t val[2]; ++}; ++struct int64x2x2_t { ++ int64x2_t val[2]; ++}; ++//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!! 
++struct int8x8x2_t { ++ int8x8_t val[2]; ++}; ++struct int16x4x2_t { ++ int16x4_t val[2]; ++}; ++struct int32x2x2_t { ++ int32x2_t val[2]; ++}; ++struct int64x1x2_t { ++ int64x1_t val[2]; ++}; ++ ++typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy ++typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy ++typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy ++typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy ++ ++typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy ++typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy ++typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy ++typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */ ++typedef struct int8x16x2_t uint8x16x2_t; ++typedef struct int16x8x2_t uint16x8x2_t; ++typedef struct int32x4x2_t uint32x4x2_t; ++typedef struct int64x2x2_t uint64x2x2_t; ++typedef struct int8x16x2_t poly8x16x2_t; ++typedef struct int16x8x2_t poly16x8x2_t; ++ ++typedef struct int8x8x2_t uint8x8x2_t; ++typedef struct int16x4x2_t uint16x4x2_t; ++typedef struct int32x2x2_t uint32x2x2_t; ++typedef struct int64x1x2_t uint64x1x2_t; ++typedef struct int8x8x2_t poly8x8x2_t; ++typedef struct int16x4x2_t poly16x4x2_t; ++ ++//float ++struct float32x4x2_t { ++ float32x4_t val[2]; ++}; ++struct float16x8x2_t { ++ float16x8_t val[2]; ++}; ++struct float32x2x2_t { ++ float32x2_t val[2]; ++}; ++ ++typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy ++typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy ++typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy ++typedef float16x8x2_t float16x4x2_t; ++ ++//4 ++struct int8x16x4_t { ++ int8x16_t val[4]; ++}; ++struct int16x8x4_t { ++ int16x8_t val[4]; ++}; ++struct int32x4x4_t { ++ int32x4_t val[4]; ++}; ++struct int64x2x4_t { ++ int64x2_t val[4]; ++}; ++ ++struct int8x8x4_t { ++ int8x8_t val[4]; ++}; ++struct int16x4x4_t { ++ int16x4_t val[4]; ++}; ++struct int32x2x4_t { ++ int32x2_t val[4]; ++}; ++struct int64x1x4_t { ++ int64x1_t val[4]; ++}; ++ ++typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy ++typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy ++typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy ++typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy ++ ++typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy ++typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy ++typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy ++typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ ++typedef struct int8x8x4_t uint8x8x4_t; ++typedef struct int16x4x4_t uint16x4x4_t; ++typedef struct int32x2x4_t uint32x2x4_t; ++typedef struct int64x1x4_t uint64x1x4_t; ++typedef struct int8x8x4_t poly8x8x4_t; ++typedef struct int16x4x4_t poly16x4x4_t; ++ ++typedef struct int8x16x4_t uint8x16x4_t; ++typedef struct int16x8x4_t uint16x8x4_t; ++typedef struct int32x4x4_t uint32x4x4_t; ++typedef 
struct int64x2x4_t uint64x2x4_t; ++typedef struct int8x16x4_t poly8x16x4_t; ++typedef struct int16x8x4_t poly16x8x4_t; ++ ++struct float32x4x4_t { ++ float32x4_t val[4]; ++}; ++struct float16x8x4_t { ++ float16x8_t val[4]; ++}; ++struct float32x2x4_t { ++ float32x2_t val[4]; ++}; ++ ++typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy ++typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy ++typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy ++typedef float16x8x4_t float16x4x4_t; ++ ++//3 ++struct int16x8x3_t { ++ int16x8_t val[3]; ++}; ++struct int32x4x3_t { ++ int32x4_t val[3]; ++}; ++struct int64x2x3_t { ++ int64x2_t val[3]; ++}; ++struct int8x16x3_t { ++ int8x16_t val[3]; ++}; ++ ++struct int16x4x3_t { ++ int16x4_t val[3]; ++}; ++struct int32x2x3_t { ++ int32x2_t val[3]; ++}; ++struct int64x1x3_t { ++ int64x1_t val[3]; ++}; ++struct int8x8x3_t { ++ int8x8_t val[3]; ++}; ++typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy ++typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy ++typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy ++typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy ++ ++typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy ++typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy ++typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy ++typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy ++ ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ ++typedef struct int8x16x3_t uint8x16x3_t; ++typedef struct int16x8x3_t uint16x8x3_t; ++typedef struct int32x4x3_t uint32x4x3_t; ++typedef struct int64x2x3_t uint64x2x3_t; ++typedef struct int8x16x3_t poly8x16x3_t; ++typedef struct int16x8x3_t poly16x8x3_t; ++typedef struct int8x8x3_t uint8x8x3_t; ++typedef struct int16x4x3_t uint16x4x3_t; ++typedef struct int32x2x3_t uint32x2x3_t; ++typedef struct int64x1x3_t uint64x1x3_t; ++typedef struct int8x8x3_t poly8x8x3_t; ++typedef struct int16x4x3_t poly16x4x3_t; ++ ++//float ++struct float32x4x3_t { ++ float32x4_t val[3]; ++}; ++struct float32x2x3_t { ++ float32x2_t val[3]; ++}; ++struct float16x8x3_t { ++ float16x8_t val[3]; ++}; ++ ++typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy ++typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy ++typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy ++typedef float16x8x3_t float16x4x3_t; ++ ++ ++//**************************************************************************** ++//****** Porting auxiliary macros ******************************************** ++ ++//** floating point related macros ** ++#define _M128i(a) _mm_castps_si128(a) ++#define _M128(a) _mm_castsi128_ps(a) ++//here the most performance effective implementation is compiler and 32/64 bits build dependent ++#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) ) ++ ++ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a))) ++ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp); ++ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp)); ++#else ++ //for 32bit gcc and Microsoft compilers builds ++ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a)) ++ 
#define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp) ++ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp)) ++#endif ++#define _pM128(a) _mm_castsi128_ps(_pM128i(a)) ++ ++#define return64(a) _M64(res64,a); return res64; ++#define return64f(a) _M64f(res64,a); return res64; ++ ++#define _Ui64(a) (*(uint64_t*)&(a)) ++#define _UNSIGNED_T(a) u ## a ++ ++#define _SIGNBIT64 ((uint64_t)1 << 63) ++#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6)) ++#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) ) ++ ++#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it" ++#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it" ++ ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++#define __constrange(min,max) const ++#define __transfersize(size) ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ ++//************************************************************************* ++//************************************************************************* ++//********* Functions declarations as declared in original arm_neon.h ***** ++//************************************************************************* ++//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes. ++int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 ++int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 ++int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 ++int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 ++float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 ++uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 ++uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 ++uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 ++uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 ++int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 ++int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 ++int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 ++int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 ++float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 ++uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 ++uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 ++uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 ++uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 ++//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
++int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 ++int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 ++int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 ++uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 ++uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0 ++uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 ++//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 ++int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 ++int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 ++uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 ++uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0 ++uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 ++//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ++int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 ++int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 ++int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 ++uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0 ++uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0 ++uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 ++int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 ++int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0 ++int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 ++uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 ++uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0 ++uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 ++//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1 ++int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 ++int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 ++int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 ++uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 ++uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0 ++uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 ++int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 ++int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 ++int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 ++uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 ++uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0 ++uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 ++//Vector saturating add: vqadd -> Vr[i]:=sat(Va[i]+Vb[i]) ++int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 ++int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 ++int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 ++int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 ++uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 ++uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0 ++uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 ++uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 ++int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 ++int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 
++int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 ++int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 ++uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 ++uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0 ++uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 ++uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 ++//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i] ++int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 ++int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 ++int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 ++uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 ++uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 ++uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 ++//Vector rounding add high half: vraddhn ++int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 ++int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 ++int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 ++uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 ++uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 ++uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 ++//Multiplication ++//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] ++int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 ++int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 ++int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 ++float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 ++uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 ++uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 ++uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 ++poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 ++int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 ++int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 ++int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 ++float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 ++uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 ++uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 ++uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 ++poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 ++//multiply lane ++int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); ++int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); ++float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); ++uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); ++uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); ++int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c); ++int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); ++float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); ++uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); ++uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); ++//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] 
++int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 ++int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 ++int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 ++float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 ++uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 ++uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 ++uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 ++int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 ++int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 ++int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 ++float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 ++uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 ++uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 ++uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 ++//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] ++int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 ++int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 ++int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 ++uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 ++uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0 ++uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 ++//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ++int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 ++int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 ++int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 ++float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 ++uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 ++uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 ++uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 ++int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 ++int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 ++int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 ++float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 ++uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 ++uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 ++uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 ++//Vector multiply subtract long ++int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 ++int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 ++int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 ++uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 ++uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0 ++uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 
++//Vector saturating doubling multiply high ++int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 ++int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 ++int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 ++int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 ++//Vector saturating rounding doubling multiply high ++int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 ++int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 ++int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 ++int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 ++//Vector saturating doubling multiply accumulate long ++int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 ++int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 ++//Vector saturating doubling multiply subtract long ++int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 ++int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 ++//Vector long multiply ++int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 ++int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 ++int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 ++uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 ++uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0 ++uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 ++poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 ++//Vector saturating doubling long multiply ++int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 ++int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 ++//Subtraction ++//Vector subtract ++int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 ++int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 ++int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 ++int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 ++float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 ++uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 ++uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 ++uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 ++uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 ++int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 ++int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 ++int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 ++int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 ++float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 ++uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 ++uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 ++uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 ++uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 ++//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 ++int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 ++int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 ++uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 ++uint32x4_t 
vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0 ++uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 ++//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0 ++int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 ++int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 ++uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 ++uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0 ++uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 ++//Vector saturating subtract ++int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 ++int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 ++int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 ++int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 ++uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 ++uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0 ++uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 ++uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 ++int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 ++int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 ++int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 ++int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 ++uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 ++uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0 ++uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 ++uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0 ++//Vector halving subtract ++int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 ++int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 ++int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 ++uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 ++uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0 ++uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 ++int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 ++int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 ++int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 ++uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 ++uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0 ++uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 ++//Vector subtract high half ++int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 ++int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 ++int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 ++uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 ++uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 ++uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 ++//Vector rounding subtract high half ++int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++uint16x4_t vrsubhn_u32(uint32x4_t a, 
uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++//Comparison ++//Vector compare equal ++uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 ++uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 ++uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 ++uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 ++uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 ++uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 ++uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 ++uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 ++uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 ++uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 ++uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 ++uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 ++uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 ++//Vector compare greater-than or equal ++uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 ++uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 ++uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++//Vector compare less-than or equal ++uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 ++uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 ++uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++//Vector compare greater-than ++uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++uint16x4_t 
vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 ++uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 ++uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++//Vector compare less-than ++uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 ++uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 ++uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++//Vector compare absolute greater-than or equal ++uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++//Vector compare absolute less-than or equal ++uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++//Vector compare absolute greater-than ++uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++//Vector compare absolute less-than ++uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++//Vector test bits ++uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 ++uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 ++uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 ++uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 ++uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 ++uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 ++uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 ++uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 ++uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 ++uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 ++uint8x16_t vtstq_u8(uint8x16_t 
a, uint8x16_t b); // VTST.8 q0, q0, q0 ++uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 ++uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 ++uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 ++//Absolute difference ++//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] | ++int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 ++int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 ++int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 ++uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 ++uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0 ++uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 ++float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 ++int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 ++int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 ++int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 ++uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 ++uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0 ++uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 ++float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 ++//Absolute difference - long ++int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 ++int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 ++int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 ++uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 ++uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0 ++uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 ++//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ++int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 ++int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 ++int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 ++uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 ++uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0 ++uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 ++int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 ++int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 ++int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 ++uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 ++uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0 ++uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 ++//Absolute difference and accumulate - long ++int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 ++int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 ++int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 ++uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 ++uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0 ++uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 ++//Max/Min ++//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ++int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 ++int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 ++int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 ++uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 ++uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0 ++uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 ++float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 ++int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 ++int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 ++int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 ++uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 ++uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 ++uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 ++float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 ++//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ++int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 ++int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 ++int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 ++uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 ++uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0 ++uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 ++float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 ++int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 ++int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 ++int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 ++uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 ++uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 ++uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 ++float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 ++//Pairwise addition ++//Pairwise add ++int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 ++int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 ++int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 ++uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 ++uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 ++uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 ++float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0 ++//Long pairwise add ++int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 ++int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 ++int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 ++uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 ++uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0 ++uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 ++int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 ++int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 ++int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 ++uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 ++uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0 ++uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 ++//Long pairwise add and accumulate ++int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 ++int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 ++int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 
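/* Usage sketch (not part of the patch): how the pairwise-add intrinsics above are
   typically chained to reduce a byte buffer to a scalar sum on an ARMv7 NEON target.
   Assumes this arm_neon.h and that n is a multiple of 16; vdupq_n_u32, vget_low_u32,
   vget_high_u32 and vget_lane_u32 are declared in other sections of the same header. */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

uint32_t sum_bytes(const uint8_t *src, size_t n)
{
    uint32x4_t acc = vdupq_n_u32(0);           /* four running 32-bit partial sums  */
    for (size_t i = 0; i < n; i += 16) {
        uint8x16_t v = vld1q_u8(src + i);      /* VLD1.8 {d0, d1}, [r0]             */
        uint16x8_t s = vpaddlq_u8(v);          /* 16 bytes -> 8 pair sums (VPADDL)  */
        acc = vpadalq_u16(acc, s);             /* accumulate into 4 words (VPADAL)  */
    }
    uint32x2_t lo = vpadd_u32(vget_low_u32(acc), vget_high_u32(acc));  /* VPADD.I32 */
    lo = vpadd_u32(lo, lo);                    /* fold the last two lanes           */
    return vget_lane_u32(lo, 0);
}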
++uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 ++uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0 ++uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 ++int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 ++int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 ++int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 ++uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 ++uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 ++uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 ++//Folding maximum vpmax -> takes maximum of adjacent pairs ++int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 ++int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 ++int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 ++uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 ++uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0 ++uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 ++float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 ++//Folding minimum vpmin -> takes minimum of adjacent pairs ++int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 ++int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 ++int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 ++uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 ++uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0 ++uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 ++float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 ++//Reciprocal/Sqrt ++float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 ++float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 ++float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 ++float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 ++//Shifts by signed variable ++//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ++int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 ++int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 ++int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 ++int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 ++uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 ++uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0 ++uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 ++uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 ++int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 ++int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 ++int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 ++int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 ++uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 ++uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 ++uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 ++uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 ++//Vector saturating shift left: (negative values shift right) ++int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 ++int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 
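/* Usage sketch (not part of the patch): vrecps_f32/vrecpsq_f32 above compute
   2 - a*b, the correction factor for Newton-Raphson refinement of the rough
   estimate returned by vrecpeq_f32 (declared elsewhere in this header, as is
   vmulq_f32). Two steps give close to full single-precision accuracy. */
#include <arm_neon.h>

float32x4_t reciprocal_f32(float32x4_t d)
{
    float32x4_t x = vrecpeq_f32(d);           /* coarse initial estimate of 1/d */
    x = vmulq_f32(x, vrecpsq_f32(d, x));      /* x *= (2 - d*x): first NR step  */
    x = vmulq_f32(x, vrecpsq_f32(d, x));      /* second NR step                 */
    return x;
}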
++int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 ++int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 ++uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 ++uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0 ++uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 ++uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 ++int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 ++int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 ++int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 ++int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 ++uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 ++uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 ++uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 ++uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 ++//Vector rounding shift left: (negative values shift right) ++int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 ++int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 ++int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 ++int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 ++uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 ++uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0 ++uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 ++uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 ++int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 ++int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 ++int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 ++int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 ++uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 ++uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 ++uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 ++uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 ++//Vector saturating rounding shift left: (negative values shift right) ++int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 ++int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 ++int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 ++int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 ++uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 ++uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0 ++uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 ++uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 ++int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 ++int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 ++int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 ++int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 ++uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 ++uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 ++uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 ++uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 ++//Shifts by a constant ++//Vector shift right by constant ++int8x8_t 
vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 ++int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 ++int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 ++int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 ++uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 ++uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16 ++uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 ++uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 ++int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 ++int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 ++int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 ++int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 ++uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 ++uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 ++uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 ++uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 ++//Vector shift left by constant ++int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++//Vector rounding shift right by constant ++int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 ++int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 ++int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 ++int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 ++uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 ++uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16 ++uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 ++uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 ++int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 
q0,q0,#8 ++int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 ++int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 ++int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 ++uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 ++uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 ++uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 ++uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 ++//Vector shift right by constant and accumulate ++int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 ++int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 ++int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 ++int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 ++uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 ++uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16 ++uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 ++uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 ++int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 ++int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 ++int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 ++int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 ++uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 ++uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 ++uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 ++uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 ++//Vector rounding shift right by constant and accumulate ++int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 ++int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 ++int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 ++int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 ++uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 ++uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16 ++uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 ++uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 ++int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 ++int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 ++int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 ++int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 ++uint8x16_t vrsraq_n_u8(uint8x16_t a, 
uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 ++uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 ++uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 ++uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 ++//Vector saturating shift left by constant ++int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 ++int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 ++int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 ++int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 ++uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 ++uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0 ++uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 ++uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 ++int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 ++int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 ++int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 ++int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 ++uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 ++uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 ++uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 ++uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 ++//Vector signed->unsigned saturating shift left by constant ++uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 ++uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 ++uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 ++uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 ++uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 ++uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 ++uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 ++uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 ++//Vector narrowing shift right by constant ++int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++//Vector signed->unsigned narrowing saturating shift right by constant ++uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 ++uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 ++uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 ++//Vector signed->unsigned rounding narrowing saturating shift right 
by constant ++uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 ++uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 ++uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 ++//Vector narrowing saturating shift right by constant ++int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 ++int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 ++int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 ++uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8 ++uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 ++uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 ++//Vector rounding narrowing shift right by constant ++int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++//Vector rounding narrowing saturating shift right by constant ++int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 ++int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 ++int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 ++uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 ++uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 ++uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 ++//Vector widening shift left by constant ++int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 ++int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 ++int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 ++uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 ++uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0 ++uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 ++//Shifts with insert ++//Vector shift right and insert ++int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 
d0,d0,#8 ++poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++//Vector shift left and insert ++int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++//Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 
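/* Usage sketch (not part of the patch): the whole-vector loads/stores introduced
   above are normally used in simple strip-mined loops together with the
   constant-shift intrinsics from the earlier section. Assumes count is a
   multiple of 16 and that this arm_neon.h is in use. */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

void halve_bytes(uint8_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = 0; i < count; i += 16) {
        uint8x16_t v = vld1q_u8(src + i);   /* VLD1.8 {d0, d1}, [r0] */
        v = vshrq_n_u8(v, 1);               /* VSHR.U8 q0, q0, #1    */
        vst1q_u8(dst + i, v);               /* VST1.8 {d0, d1}, [r0] */
    }
}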
++//Load a single vector from memory ++uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] ++float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] ++uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] ++uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] ++uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] ++int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] ++int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] ++int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] ++float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] ++poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] ++poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] ++//Load a single lane from memory ++uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] ++int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0] ++float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] ++float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0] ++poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const 
* ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] ++uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] ++uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] ++int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0] ++int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] ++float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] ++poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] ++poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++//Load all lanes of vector with same value from memory ++uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++int32x2_t vld1_dup_s32(__transfersize(1) int32_t const *
ptr); // VLD1.32 {d0[]}, [r0] ++int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++//Store a single vector or lane. Stores all lanes or a single lane of a vector. ++//Store a single vector into memory ++void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] ++void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] ++void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] ++void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] ++void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] ++//Store a lane of a vector into memory ++//Loads of an N-element structure ++//Load N-element structure from memory ++uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] ++float32x4x2_t 
vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] ++float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++uint16x8x4_t 
vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++//Load all lanes of N-element structure with same value from memory ++uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t 
const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++//Load a single lane of N-element structure from memory ++//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned ++uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 
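/* Usage sketch (not part of the patch): the *_ptr lane-load variants above take the
   existing structure by address because of the MSVC C2719 issue noted in the comment;
   calling code passes &src and uses the returned value. The un-suffixed names
   (e.g. vld2q_lane_u16) are assumed to be provided as thin wrappers elsewhere in
   this header. */
#include <arm_neon.h>
#include <stdint.h>

uint16x8x2_t load_pair_lane0(const uint16_t *p, uint16x8x2_t src)
{
    /* reads p[0] and p[1] into lane 0 of src.val[0] and src.val[1] */
    return vld2q_lane_u16_ptr(p, &src, 0);   /* VLD2.16 {d0[0], d2[0]}, [r0] */
}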
++uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0] ++int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0] ++//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); 
//VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++//Store N-element structure to memory 
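/* Usage sketch (not part of the patch): the N-element stores declared below take the
   structure operand by address, the same C2719 workaround as the lane loads. A typical
   use is interleaving separate planes into one stream; a plain vst2_u8 name is assumed
   to exist elsewhere in this header as a wrapper over vst2_u8_ptr. */
#include <arm_neon.h>
#include <stdint.h>

void interleave2_u8(uint8_t *dst, const uint8_t *plane_a, const uint8_t *plane_b)
{
    uint8x8x2_t v;
    v.val[0] = vld1_u8(plane_a);   /* VLD1.8 {d0}, [r0]                        */
    v.val[1] = vld1_u8(plane_b);   /* VLD1.8 {d1}, [r0]                        */
    vst2_u8_ptr(dst, &v);          /* VST2.8 {d0, d1}, [r0]: a0,b0,a1,b1,...   */
}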
++void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0] ++void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0] ++//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // 
VST3.32 {d0, d1, d2}, [r0] ++void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] ++void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] ++void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] ++void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] ++void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++//Store a single lane of N-element structure to memory ++void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 
++void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] ++void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] ++void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] ++void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] ++void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] ++void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] ++void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void 
vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] ++void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // 
VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. ++uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] ++uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] ++int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] ++int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] ++float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] ++uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] ++int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] ++int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] ++float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 
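// Illustrative sketch (editorial example, not part of the patched header):
// the lane get/set intrinsics above and below combined to read two elements
// and write their sum back into lane 1. fold_into_lane1 is a made-up helper
// name; lane indices must be compile-time constants.
static inline int32x2_t fold_into_lane1(int32x2_t v)
{
    int32_t lo = vget_lane_s32(v, 0);     // extract lane 0
    int32_t hi = vget_lane_s32(v, 1);     // extract lane 1
    return vset_lane_s32(lo + hi, v, 1);  // store the sum back into lane 1
}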
++uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++//Initialize a vector from a literal bit pattern. 
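// Illustrative sketch (editorial example, not part of the patched header):
// vcreate_* builds a 64-bit vector directly from an integer bit pattern, with
// lane 0 taking the least significant byte. ramp_0_to_7 is a made-up name.
static inline uint8x8_t ramp_0_to_7(void)
{
    return vcreate_u8(0x0706050403020100ULL); // lanes 0..7 hold 0,1,2,...,7
}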
++int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 ++int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 ++int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 ++float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 ++float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 ++uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 ++uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 ++uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 ++uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 ++poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 ++poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 ++int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 ++//Set all lanes to same value ++//Load all lanes of vector to the same literal value ++uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 ++uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 ++uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 ++int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 ++int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 ++int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 ++poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 ++poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 ++float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 ++uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 ++int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 ++int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 ++poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 ++int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 ++int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 ++uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 ++uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 ++int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 ++int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 ++int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 ++poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 ++poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 ++float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 ++uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 ++int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 ++int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 ++poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 ++int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 ++int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++//Load all lanes of the vector to the value of a lane of a vector ++uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++uint32x2_t vdup_lane_u32(uint32x2_t vec, 
__constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. ++int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 ++int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 ++int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 ++int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 ++float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 ++float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 ++uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 ++uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 ++uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 ++poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 ++poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 ++//Splitting vectors. 
These intrinsics split a 128 bit vector into 2 component 64 bit vectors ++int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 ++int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 ++int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 ++int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 ++float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 ++float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 ++uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 ++uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 ++uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 ++uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 ++poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 ++poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 ++int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 ++int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 ++int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 ++int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 ++float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 ++float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 ++uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 ++uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 ++uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 ++uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 ++poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 ++poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 ++//Converting vectors. These intrinsics are used to convert vectors. ++//Convert from float ++int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 ++uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 ++int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 ++uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 ++int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 ++uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 ++int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 ++uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 ++//Convert to float ++float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 ++float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 ++float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 ++float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 ++float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 ++float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 ++float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 ++float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 ++//Convert between floats ++float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 ++float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 ++//Vector narrow integer ++int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 ++int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 ++int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 ++uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 ++uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 ++uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 ++//Vector long move ++int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 ++int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 ++int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 ++uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 
q0,d0 ++uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0 ++uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 ++//Vector saturating narrow integer ++int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 ++int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 ++int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 ++uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0 ++uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0 ++uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0 ++//Vector saturating narrow integer signed->unsigned ++uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 ++uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 ++uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 ++//Table look up ++uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 ++poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++//Extended table look up intrinsics ++uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 ++poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++//Operations with a scalar value ++//Vector multiply accumulate with scalar ++int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] ++int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] ++uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] ++uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] ++float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 
d0,d0, d0[0] ++int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0] ++int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0] ++uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0] ++uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0] ++float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0] ++//Vector widening multiply accumulate with scalar ++int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0] ++int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0] ++uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0] ++uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0] ++//Vector widening saturating doubling multiply accumulate with scalar ++int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0] ++int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0] ++//Vector multiply subtract with scalar ++int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] ++int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] ++uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] ++uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] ++float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0] ++int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0] ++int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0] ++uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0] ++uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0] ++float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0] ++//Vector widening multiply subtract with scalar ++int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0] ++int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0] ++uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0] ++uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0] ++//Vector widening saturating doubling multiply subtract with scalar ++int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0] ++int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0] ++//Vector 
multiply by scalar ++int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] ++int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] ++float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] ++uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] ++uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] ++int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] ++int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] ++float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] ++uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] ++uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] ++//Vector long multiply with scalar ++int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] ++int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] ++uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0] ++uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] ++//Vector long multiply by scalar ++int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] ++int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] ++uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0] ++uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] ++//Vector saturating doubling long multiply with scalar ++int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] ++int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] ++//Vector saturating doubling long multiply by scalar ++int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] ++int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] ++//Vector saturating doubling multiply high with scalar ++int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] ++int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] ++int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] ++int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] ++//Vector saturating doubling multiply high by scalar ++int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] ++int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] ++int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] ++int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] ++//Vector saturating rounding doubling multiply high with scalar ++int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] ++int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] ++int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] ++int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] ++//Vector rounding saturating doubling multiply high by scalar 
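// Illustrative sketch (editorial example, not part of the patched header):
// a Q15 fixed-point gain using the rounding, saturating doubling multiply-high
// "by scalar lane" form declared just below. apply_q15_gain is a made-up name;
// the result is roughly (samples * gains[0] * 2 + rounding) >> 16, saturated.
static inline int16x8_t apply_q15_gain(int16x8_t samples, int16x4_t gains)
{
    return vqrdmulhq_lane_s16(samples, gains, 0); // VQRDMULH.S16 q?,q?,d?[0]
}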
++int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] ++int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] ++int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] ++int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] ++//Vector multiply accumulate with scalar ++int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] ++int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] ++uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] ++uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] ++float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] ++int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] ++int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] ++uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] ++uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] ++float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] ++//Vector widening multiply accumulate with scalar ++int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] ++int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] ++uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0] ++uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] ++//Vector widening saturating doubling multiply accumulate with scalar ++int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] ++int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] ++//Vector multiply subtract with scalar ++int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] ++int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] ++uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] ++uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] ++float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] ++int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] ++int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] ++uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] ++uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] ++float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] ++//Vector widening multiply subtract with scalar ++int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] ++int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] ++uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0] ++uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] ++//Vector widening saturating doubling multiply subtract with scalar ++int32x4_t 
vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] ++int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] ++//Vector extract ++int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 ++uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 ++float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 ++uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 ++int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 ++uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 ++float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0 ++//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 
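// Illustrative sketch (editorial example, not part of the patched header):
// endianness swap of four 32-bit words by reversing the bytes within each
// 32-bit lane, using the VREV32 mapping declared below. Assumes the standard
// vreinterpretq_* casts are also provided by this header; bswap32x4 is a
// made-up name.
static inline uint32x4_t bswap32x4(uint32x4_t v)
{
    uint8x16_t bytes = vreinterpretq_u8_u32(v); // view the four words as 16 bytes
    bytes = vrev32q_u8(bytes);                  // VREV32.8: reverse bytes per 32-bit lane
    return vreinterpretq_u32_u8(bytes);
}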
++int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0 ++int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0 ++int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0 ++uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0 ++uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0 ++uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0 ++poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0 ++poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0 ++float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0 ++int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 ++int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 ++int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 ++uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 ++uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 ++uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 ++poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 ++poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 ++float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 ++int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0 ++int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0 ++uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0 ++uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0 ++poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0 ++poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0 ++int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 ++int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 ++uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 ++uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 ++poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 ++poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0 ++int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0 ++uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0 ++poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0 ++int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 ++uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 ++poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 ++//Other single operand arithmetic ++//Absolute: Vd[i] = |Va[i]| ++int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0 ++int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0 ++int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0 ++float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 ++int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 ++int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 ++int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 ++float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 ++//Saturating absolute: Vd[i] = sat(|Va[i]|) ++int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 ++int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 ++int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 ++int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 ++int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 ++int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 ++//Negate: Vd[i] = - Va[i] ++int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0 ++int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0 ++int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0 ++float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0 ++int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 ++int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 ++int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 ++float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 ++//Saturating Negate: sat(Vd[i] = - Va[i]) ++int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0 ++int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0 
++int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0 ++int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 ++int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 ++int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 ++//Count leading sign bits ++int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 ++int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 ++int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 ++int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 ++int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 ++int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 ++//Count leading zeros ++int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0 ++int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0 ++int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0 ++uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0 ++uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0 ++uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0 ++int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 ++int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 ++int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 ++uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 ++uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 ++uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 ++//Count number of set bits ++uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 ++int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 ++poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 ++uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 ++int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 ++poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 ++//Reciprocal estimate ++float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 ++uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 ++float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 ++uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 ++//Reciprocal square root estimate ++float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 ++uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 ++float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 ++uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 ++//Logical operations ++//Bitwise not ++int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 ++int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 ++int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 ++uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 ++uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 ++uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 ++poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 ++int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 ++int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 ++int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 ++uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 ++uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 ++uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 ++poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 ++//Bitwise and ++int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 ++int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 ++int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 ++int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 ++uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 ++uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 ++uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 ++uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 ++int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 ++int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 ++int32x4_t vandq_s32(int32x4_t a, int32x4_t 
b); // VAND q0,q0,q0 ++int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 ++uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 ++uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 ++uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 ++uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 ++//Bitwise or ++int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 ++int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 ++int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 ++int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 ++uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 ++uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 ++uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 ++uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 ++int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 ++int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 ++int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 ++int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 ++uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 ++uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 ++uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 ++uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 ++//Bitwise exclusive or (EOR or XOR) ++int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 ++int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 ++int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 ++int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 ++uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 ++uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 ++uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 ++uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 ++int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 ++int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 ++int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 ++int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 ++uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 ++uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 ++uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 ++uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 ++//Bit Clear ++int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 ++int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 ++int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 ++int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 ++uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 ++uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 ++uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 ++uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 ++int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 ++int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 ++int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 ++int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 ++uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 ++uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 ++uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 
++uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 ++//Bitwise OR complement ++int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 ++int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 ++int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 ++int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 ++uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 ++uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 ++uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 ++uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 ++int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 ++int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 ++int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 ++int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 ++uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 ++uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 ++uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 ++uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 ++//Bitwise Select ++int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 ++int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 ++int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 ++int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 ++uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 ++uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 ++uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 ++uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 ++float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 ++poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 ++poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 ++int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 ++int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 ++int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 ++int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 ++uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 ++uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 ++uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 ++uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 ++float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 ++poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 ++poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 ++//Transposition operations ++//Transpose elements ++int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 ++int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 ++int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 ++uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 ++uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 ++uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 ++float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 ++poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 
++poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 ++int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 ++int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 ++int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 ++uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 ++uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 ++uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 ++float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 ++poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 ++poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 ++//Interleave elements ++int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 ++int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 ++int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 ++uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 ++uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 ++uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 ++float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 ++poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 ++poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 ++int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 ++int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 ++int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 ++uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 ++uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 ++uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 ++float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 ++poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 ++poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 ++//De-Interleave elements ++int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 ++int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 ++int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 ++uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 ++uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 ++uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 ++float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 ++poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 ++poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 ++int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 ++int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 ++int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 ++uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 ++uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 ++uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 ++float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 ++poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 ++poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 ++ ++ ++//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
While for release build it is not a must, ++//for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal ++// ++#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers. ++ ++ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 ++ ++ #define _MM_EXTRACT_EPI16 _mm_extract_epi16 ++ #define _MM_INSERT_EPI16 _mm_insert_epi16 ++#ifdef USE_SSE4 ++ #define _MM_EXTRACT_EPI8 _mm_extract_epi8 ++ #define _MM_EXTRACT_EPI32 _mm_extract_epi32 ++ #define _MM_EXTRACT_PS _mm_extract_ps ++ ++ #define _MM_INSERT_EPI8 _mm_insert_epi8 ++ #define _MM_INSERT_EPI32 _mm_insert_epi32 ++ #define _MM_INSERT_PS _mm_insert_ps ++#ifdef _NEON2SSE_64BIT ++ #define _MM_INSERT_EPI64 _mm_insert_epi64 ++ #define _MM_EXTRACT_EPI64 _mm_extract_epi64 ++#endif ++#endif //SSE4 ++#else ++ #define _NEON2SSE_COMMA , ++ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ ++ switch(LANE) \ ++ { \ ++ case 0: return NAME(a b, 0); \ ++ case 1: return NAME(a b, 1); \ ++ case 2: return NAME(a b, 2); \ ++ case 3: return NAME(a b, 3); \ ++ case 4: return NAME(a b, 4); \ ++ case 5: return NAME(a b, 5); \ ++ case 6: return NAME(a b, 6); \ ++ case 7: return NAME(a b, 7); \ ++ case 8: return NAME(a b, 8); \ ++ case 9: return NAME(a b, 9); \ ++ case 10: return NAME(a b, 10); \ ++ case 11: return NAME(a b, 11); \ ++ case 12: return NAME(a b, 12); \ ++ case 13: return NAME(a b, 13); \ ++ case 14: return NAME(a b, 14); \ ++ case 15: return NAME(a b, 15); \ ++ default: return NAME(a b, 0); \ ++ } ++ ++ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ ++ switch(LANE) \ ++ { \ ++ case 0: return NAME(vec p,0); \ ++ case 1: return NAME(vec p,1); \ ++ case 2: return NAME(vec p,2); \ ++ case 3: return NAME(vec p,3); \ ++ case 4: return NAME(vec p,4); \ ++ case 5: return NAME(vec p,5); \ ++ case 6: return NAME(vec p,6); \ ++ case 7: return NAME(vec p,7); \ ++ default: return NAME(vec p,0); \ ++ } ++ ++ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ ++ switch(LANE) \ ++ { \ ++ case case0: return NAME(vec p,case0); \ ++ case case1: return NAME(vec p,case1); \ ++ case case2: return NAME(vec p,case2); \ ++ case case3: return NAME(vec p,case3); \ ++ default: return NAME(vec p,case0); \ ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) ++ { ++ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) ++ } ++ ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) ++ { ++ 
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) ++ } ++ ++#ifdef _NEON2SSE_64BIT ++ //the special case of functions available only for SSE4 and 64-bit build. ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) ++ { ++ switch(LANE) { ++ case 0: ++ return _mm_insert_epi64(vec, p, 0); ++ case 1: ++ return _mm_insert_epi64(vec, p, 1); ++ default: ++ return _mm_insert_epi64(vec, p, 0); ++ } ++ } ++ ++ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) ++ { ++ if (LANE ==0) return _mm_extract_epi64(val, 0); ++ else return _mm_extract_epi64(val, 1); ++ } ++#endif ++ ++ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++#endif //USE_SSE4 ++ ++#endif //#ifdef NDEBUG ++ ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices ++// or for some specific commonly used operations implementation missing in SSE ++#ifdef USE_SSE4 ++ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 ++ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 ++ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 ++ ++ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 ++ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 ++ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 ++ ++ #define _MM_MAX_EPI8 _mm_max_epi8 ++ #define _MM_MAX_EPI32 _mm_max_epi32 ++ #define _MM_MAX_EPU16 _mm_max_epu16 ++ #define _MM_MAX_EPU32 _mm_max_epu32 ++ ++ #define _MM_MIN_EPI8 _mm_min_epi8 ++ #define _MM_MIN_EPI32 _mm_min_epi32 ++ #define _MM_MIN_EPU16 _mm_min_epu16 ++ #define _MM_MIN_EPU32 _mm_min_epu32 ++ ++ #define _MM_BLENDV_EPI8 _mm_blendv_epi8 ++ #define _MM_PACKUS_EPI32 _mm_packus_epi32 ++ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) ++ ++ #define _MM_MULLO_EPI32 _mm_mullo_epi32 ++ #define _MM_MUL_EPI32 _mm_mul_epi32 ++ ++ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64 ++#else //no SSE4 !!!!!! 
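++    //SSSE3-only path: the SSE4.1 helpers aliased above are rebuilt below from SSE2/SSSE3 primitives (or serially where noted)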
++ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi8(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi16(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi32(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi8(zero, a); ++ return _mm_unpacklo_epi8(a, sign); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi16(zero, a); ++ return _mm_unpacklo_epi16(a, sign); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi32(zero, a); ++ return _mm_unpacklo_epi32(a, sign); ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t tmp[4]; ++ _mm_store_si128((__m128i*)tmp, vec); ++ return tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int8_t tmp[16]; ++ _mm_store_si128((__m128i*)tmp, vec); ++ return (int)tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t tmp[4]; ++ _mm_store_si128((__m128i*)tmp, _M128i(vec)); ++ return tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = p; ++ mask[LANE] = 0x0; ++ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = (int8_t)p; ++ mask[LANE] = 0x0; ++ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; ++ __m128 tmp, vec_masked, p_masked; ++ mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it ++ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p ++ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec ++ tmp = _mm_or_ps(vec_masked, p_masked); ++ return tmp; ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi8 (a, b); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi32(a, b); ++ resa = _mm_and_si128 (cmp, a); ++ resb = 
_mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) ++ { ++ __m128i c8000, b_s, a_s, cmp; ++ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff ++ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 ++ b_s = _mm_sub_epi16 (b, c8000); ++ a_s = _mm_sub_epi16 (a, c8000); ++ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) ++ { ++ __m128i c80000000, b_s, a_s, cmp; ++ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 ++ b_s = _mm_sub_epi32 (b, c80000000); ++ a_s = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi8 (b, a); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi32(b, a); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) ++ { ++ __m128i c8000, b_s, a_s, cmp; ++ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff ++ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 ++ b_s = _mm_sub_epi16 (b, c8000); ++ a_s = _mm_sub_epi16 (a, c8000); ++ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) ++ { ++ __m128i c80000000, b_s, a_s, cmp; ++ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 ++ b_s = _mm_sub_epi32 (b, c80000000); ++ a_s = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below ++ { ++ //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
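++    //e.g. for a mask byte of 0x80 the real _mm_blendv_epi8 returns the whole byte from b,
++    //while this AND/ANDNOT version mixes bits of a and b; the all-0x00/0xff masks used below make both agree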
++ __m128i a_masked, b_masked; ++ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff ++ a_masked = _mm_andnot_si128 (mask,a); ++ return _mm_or_si128(a_masked, b_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) ++ { ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ __m128i a16, b16, res, reshi,cmp, zero; ++ zero = _mm_setzero_si128(); ++ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); ++ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); ++ res = _mm_unpacklo_epi64(a16, b16); //result without saturation ++ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation ++ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero ++ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive ++ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) ++ { ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ __m128i a16, res, reshi,cmp, zero; ++ zero = _mm_setzero_si128(); ++ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); ++ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation ++ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero ++ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive ++ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff ++ } ++ ++ ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; ++ int64_t res64; ++ int i; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ for (i = 0; i<4; i++) { ++ res64 = atmp[i] * btmp[i]; ++ res[i] = (int)(res64 & 0xffffffff); ++ } ++ return _mm_load_si128((__m128i*)res); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) ++ { ++ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; ++ sign = _mm_xor_si128 (a, b); ++ sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive ++ zero = _mm_setzero_si128(); ++ a_neg = _mm_abs_epi32 (a); //negate a and b ++ b_neg = _mm_abs_epi32 (b); //negate a and b ++ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result ++ mul_us_neg = _mm_sub_epi64(zero, mul_us); ++ mul_us_neg = _mm_and_si128(sign, mul_us_neg); ++ mul_us = _mm_andnot_si128(sign, mul_us); ++ return _mm_or_si128 (mul_us, mul_us_neg); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b) ++ { ++ __m128i res; ++ res = _mm_cmpeq_epi32 (a, b); ++ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data ++ } ++#endif //SSE4 ++ ++//the special case of functions working only for 32 bits, no SSE4 ++_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; ++ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = p; ++ mask[LANE] = 0x0; ++ 
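++    //mask is now all-ones in every lane except LANE: AND keeps the untouched lanes of vec,
++    //ANDNOT pulls p out of pvec, and OR merges the two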
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++} ++ ++_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE) ++{ ++ _NEON2SSE_ALIGN_16 int64_t tmp[2]; ++ _mm_store_si128((__m128i*)tmp, val); ++ return tmp[LANE]; ++} ++ ++#ifndef _NEON2SSE_64BIT_SSE4 ++ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32 ++ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32 ++#endif ++ ++int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints ++_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) ++{ ++ //Overflow happens only if a and sum have the opposite signs ++ __m128i c7fffffff, res, res_sat, res_xor_a; ++ c7fffffff = _mm_set1_epi32(0x7fffffff); ++ res = _mm_slli_epi32 (a, 1); // res = a*2 ++ res_sat = _mm_srli_epi32(a, 31); ++ res_sat = _mm_add_epi32(res_sat, c7fffffff); ++ res_xor_a = _mm_xor_si128(res, a); ++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise ++ res_sat = _mm_and_si128(res_xor_a, res_sat); ++ res = _mm_andnot_si128(res_xor_a, res); ++ return _mm_or_si128(res, res_sat); ++} ++ ++ ++//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ++//************************************************************************* ++//************************************************************************* ++//***************** Functions redefinition\implementatin starts here ***** ++//************************************************************************* ++//************************************************************************* ++//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ++ ++/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample: ++#ifdef ARM ++#define vector_addq_s32 _mm_add_epi32 ++#else //if we have IA ++#define vector_addq_s32 vadd_s32 ++#endif ++ ++******************************************************************************************** ++Functions below are organised in the following way: ++ ++Each NEON intrinsic function has one of the following options: ++1. its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement ++2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement ++3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition ++4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance ++the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path ++- please consider such functions removal from your code. 
++*/ ++ ++//*********************************************************************** ++//************************ Vector add ***************************** ++//*********************************************************************** ++int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0]; ++ return res64; ++} ++ ++ ++float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b) ++{ ++ __m128 res; ++ __m64_128 res64; ++ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 ++#define vadd_u8 vadd_s8 ++ ++uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 ++#define vadd_u16 vadd_s16 ++ ++uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 ++#define vadd_u32 vadd_s32 ++ ++uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) ++{ ++ uint64x1_t res64; ++ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0]; ++ return res64; ++} ++ ++ ++int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 ++#define vaddq_s8 _mm_add_epi8 ++ ++int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 ++#define vaddq_s16 _mm_add_epi16 ++ ++int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 ++#define vaddq_s32 _mm_add_epi32 ++ ++int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 ++#define vaddq_s64 _mm_add_epi64 ++ ++float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 ++#define vaddq_f32 _mm_add_ps ++ ++uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 ++#define vaddq_u8 _mm_add_epi8 ++ ++uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 ++#define vaddq_u16 _mm_add_epi16 ++ ++uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 ++#define vaddq_u32 _mm_add_epi32 ++ ++uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 ++#define vaddq_u64 _mm_add_epi64 ++ ++//**************************** Vector long add *****************************: ++//*********************************************************************** ++//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
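Every intrinsic in this group follows the same widen-then-add pattern spelled out by vaddl_s8 just below: extend both 64-bit inputs to the doubled element width, then add in the wider type where overflow cannot occur. A minimal standalone sketch of that idea, assuming SSE4.1 and using an illustrative function name that is not part of this header:

#include <smmintrin.h>   /* SSE4.1: _mm_cvtepi8_epi16 */

/* the low 8 bytes of a and b hold the int8 inputs; the result holds eight int16 sums */
static inline __m128i widen_add_s8_sketch(__m128i a, __m128i b)
{
    __m128i a16 = _mm_cvtepi8_epi16(a);  /* sign-extend 8 x int8 to 8 x int16 */
    __m128i b16 = _mm_cvtepi8_epi16(b);
    return _mm_add_epi16(a16, b16);      /* the sum of two int8 values always fits in int16 */
}

The unsigned variants use the zero-extending _mm_cvtepu8_epi16 instead, which is what the _MM_CVTEPU8_EPI16 helper above emulates on SSSE3-only targets.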
++int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_add_epi16 (a16, b16); ++} ++ ++int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a32, b32); ++} ++ ++int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 ( a64, b64); ++} ++ ++uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi16 (a16, b16); ++} ++ ++uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a32, b32); ++} ++ ++uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a64, b64); ++} ++ ++//*************** Vector wide add: vaddw_. 
Vr[i]:=Va[i]+Vb[i] ****************** ++//*************** ********************************************************************* ++int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 ++_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_add_epi16 (a, b16); ++} ++ ++int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 ++_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, ++ return _mm_add_epi32 (a, b32); ++} ++ ++int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 ++_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a, b64); ++} ++ ++uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 ++_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi16 (a, b16); ++} ++ ++uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0 ++_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a, b32); ++} ++ ++uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 ++_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a, b64); ++} ++ ++//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* ++//************************************************************************************************************************* ++int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vhaddq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64( vhaddq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64( vhaddq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64( vhaddq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64( vhaddq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64( vhaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 
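++    //why this works: x + y == 2*(x & y) + (x ^ y), so (x + y) >> 1 == (x & y) + ((x ^ y) >> 1),
++    //computed without ever forming the full sum and therefore without leaving the element width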
++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = vshrq_n_s8(tmp2,1); ++ return _mm_add_epi8(tmp1,tmp2); ++} ++ ++int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srai_epi16(tmp2,1); ++ return _mm_add_epi16(tmp1,tmp2); ++} ++ ++int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srai_epi32(tmp2,1); ++ return _mm_add_epi32(tmp1,tmp2); ++} ++ ++uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 ++{ ++ __m128i c1, sum, res; ++ c1 = _mm_set1_epi8(1); ++ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it ++ res = _mm_xor_si128(a, b); //for rounding compensation ++ res = _mm_and_si128(res,c1); //for rounding compensation ++ return _mm_sub_epi8 (sum, res); //actual rounding compensation ++} ++ ++uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 ++{ ++ __m128i sum, res; ++ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it ++ res = _mm_xor_si128(a, b); //for rounding compensation ++ res = _mm_slli_epi16 (res,15); //shift left then back right to ++ res = _mm_srli_epi16 (res,15); //get 1 or zero ++ return _mm_sub_epi16 (sum, res); //actual rounding compensation ++} ++ ++uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srli_epi32(tmp2,1); ++ return _mm_add_epi32(tmp1,tmp2); ++} ++ ++//************************Vector rounding halving add: vrhadd{q}_. Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** ++//***************************************************************************************************************************** ++int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 
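++    //that rounding is wanted here: NEON vrhadd computes (a + b + 1) >> 1, exactly what _mm_avg_epu8 does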
++} ++ ++ ++uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! ++} ++ ++ ++uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 ++{ ++ //no signed average in x86 SIMD, go to unsigned ++ __m128i c128, au, bu, sum; ++ c128 = _mm_set1_epi8(0x80); //-128 ++ au = _mm_sub_epi8(a, c128); //add 128 ++ bu = _mm_sub_epi8(b, c128); //add 128 ++ sum = _mm_avg_epu8(au, bu); ++ return _mm_add_epi8 (sum, c128); //sub 128 ++} ++ ++int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 ++{ ++ //no signed average in x86 SIMD, go to unsigned ++ __m128i cx8000, au, bu, sum; ++ cx8000 = _mm_set1_epi16(0x8000); // - 32768 ++ au = _mm_sub_epi16(a, cx8000); //add 32768 ++ bu = _mm_sub_epi16(b, cx8000); //add 32768 ++ sum = _mm_avg_epu16(au, bu); ++ return _mm_add_epi16 (sum, cx8000); //sub 32768 ++} ++ ++int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) ++{ ++ //need to avoid overflow ++ __m128i a2, b2, res, sum; ++ a2 = _mm_srai_epi32(a,1); //a2=a/2; ++ b2 = _mm_srai_epi32(b,1); // b2=b/2; ++ res = _mm_or_si128(a,b); //for rounding ++ res = _mm_slli_epi32 (res,31); //shift left then back right to ++ res = _mm_srli_epi32 (res,31); //get 1 or zero ++ sum = _mm_add_epi32(a2,b2); ++ return _mm_add_epi32(sum,res); ++} ++ ++uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 ++#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded ++ ++uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 ++#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded ++ ++ ++uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 ++{ ++ //need to avoid overflow ++ __m128i a2, b2, res, sum; ++ a2 = _mm_srli_epi32(a,1); //a2=a/2; ++ b2 = _mm_srli_epi32(b,1); // b2=b/2; ++ res = _mm_or_si128(a,b); //for rounding ++ res = _mm_slli_epi32 (res,31); //shift left then back right to ++ res = _mm_srli_epi32 (res,31); //get 1 or zero ++ sum = _mm_add_epi32(a2,b2); ++ return _mm_add_epi32(sum,res); ++} ++ ++//****************** VQADD: Vector saturating add ************************ ++//************************************************************************ ++int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vqaddq_s32(_pM128i(a), _pM128i(b))); 
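++    //as in most d-register forms of this file, the q-register implementation does the work and only the low 64 bits are kept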
++} ++ ++ ++int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int64x1_t res; ++ uint64_t a64, b64; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 + b64; ++ a64 = (a64 >> 63) + (~_SIGNBIT64); ++ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) { ++ res.m64_u64[0] = a64; ++ } ++ return res; ++} ++ ++uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vqaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t a64, b64; ++ uint64x1_t res; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 + b64; ++ if (res.m64_u64[0] < a64) { ++ res.m64_u64[0] = ~(uint64_t)0; ++ } ++ return res; ++} ++ ++int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 ++#define vqaddq_s8 _mm_adds_epi8 ++ ++int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 ++#define vqaddq_s16 _mm_adds_epi16 ++ ++int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b) ++{ ++ //no corresponding x86 SIMD soulution, special tricks are necessary. 
Overflow happens only if a and b have the same sign and sum has the opposite sign ++ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_; ++ c7fffffff = _mm_set1_epi32(0x7fffffff); ++ res = _mm_add_epi32(a, b); ++ res_sat = _mm_srli_epi32(a, 31); ++ res_sat = _mm_add_epi32(res_sat, c7fffffff); ++ res_xor_a = _mm_xor_si128(res, a); ++ b_xor_a_ = _mm_xor_si128(b, a); ++ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a); ++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise ++ res_sat = _mm_and_si128(res_xor_a, res_sat); ++ res = _mm_andnot_si128(res_xor_a, res); ++ return _mm_or_si128(res, res_sat); ++} ++ ++int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = atmp[0] + btmp[0]; ++ res[1] = atmp[1] + btmp[1]; ++ ++ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64); ++ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64); ++ ++ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) { ++ res[0] = atmp[0]; ++ } ++ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) { ++ res[1] = atmp[1]; ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 ++#define vqaddq_u8 _mm_adds_epu8 ++ ++uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0 ++#define vqaddq_u16 _mm_adds_epu16 ++ ++uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b) ++{ ++ __m128i c80000000, cmp, subsum, suba, sum; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ sum = _mm_add_epi32 (a, b); ++ subsum = _mm_sub_epi32 (sum, c80000000); ++ suba = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed ++ return _mm_or_si128 (sum, cmp); //saturation ++} ++ ++uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b) ++ { ++ __m128i c80000000, sum, cmp, suba, subsum; ++ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); ++ sum = _mm_add_epi64 (a, b); ++ subsum = _mm_sub_epi64 (sum, c80000000); ++ suba = _mm_sub_epi64 (a, c80000000); ++ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!! 
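++        //subtracting 0x8000000000000000 from both values maps unsigned order onto signed order,
++        //so this signed 64-bit compare is effectively the unsigned test a > sum, i.e. the carry-out that requires saturation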
++ return _mm_or_si128 (sum, cmp); //saturation ++ } ++#else ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = atmp[0] + btmp[0]; ++ res[1] = atmp[1] + btmp[1]; ++ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; ++ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; ++ return _mm_load_si128((__m128i*)(res)); ++ } ++#endif ++ ++ ++//******************* Vector add high half (truncated) ****************** ++//************************************************************************ ++int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum; ++ sum = _mm_add_epi16 (a, b); ++ sum = _mm_srai_epi16 (sum, 8); ++ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only ++ return64(sum); ++} ++ ++int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0 ++{ ++ int16x4_t res64; ++ __m128i sum; ++ sum = _mm_add_epi32 (a, b); ++ sum = _mm_srai_epi32(sum, 16); ++ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only ++ return64(sum); ++} ++ ++int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b) ++{ ++ int32x2_t res64; ++ __m128i sum; ++ sum = _mm_add_epi64 (a, b); ++ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sum); ++} ++ ++uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum; ++ sum = _mm_add_epi16 (a, b); ++ sum = _mm_srli_epi16 (sum, 8); ++ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only ++ return64(sum); ++} ++ ++uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0 ++{ ++ uint16x4_t res64; ++ __m128i sum; ++ sum = _mm_add_epi32 (a, b); ++ sum = _mm_srli_epi32 (sum, 16); ++ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only ++ return64(sum); ++} ++ ++uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 ++#define vaddhn_u64 vaddhn_s64 ++ ++//*********** Vector rounding add high half: vraddhn_ ******************. 
++//*************************************************************************** ++int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sum = _mm_srai_epi16 (sum, 8); //get high half ++ sum = _mm_add_epi16 (sum, mask1); //actual rounding ++ sum = _mm_packs_epi16 (sum, sum); ++ return64(sum); ++} ++ ++int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ int16x4_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sum = _mm_srai_epi32 (sum, 16); //get high half ++ sum = _mm_add_epi32 (sum, mask1); //actual rounding ++ sum = _mm_packs_epi32 (sum, sum); ++ return64(sum); ++} ++ ++int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ int32x2_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi64 (a, b); ++ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero ++ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding ++ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6)); ++ return64(sum); ++} ++ ++uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sum = _mm_srai_epi16 (sum, 8); //get high half ++ sum = _mm_add_epi16 (sum, mask1); //actual rounding ++ sum = _mm_packus_epi16 (sum, sum); ++ return64(sum); ++} ++ ++uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ uint16x4_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sum = _mm_srai_epi32 (sum, 16); //get high half ++ sum = _mm_add_epi32 (sum, mask1); //actual rounding ++ sum = _MM_PACKUS1_EPI32 (sum); ++ return64(sum); ++} ++ ++uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 ++#define vraddhn_u64 vraddhn_s64 ++ ++//********************************************************************************** ++//********* Multiplication ************************************* ++//************************************************************************************** ++ ++//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] ++//As we don't go to wider result functions are equal to "multiply low" in x86 ++int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0 ++{ ++ // no 8 bit 
simd multiply, need to go to 16 bits in SSE ++ int8x8_t res64; ++ __m128i a128, b128, res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits ++ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits ++ res = _mm_mullo_epi16 (a128, b128); ++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only ++ return64(res); ++} ++ ++int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 ++#define vmul_s16 vmul_u16 ++ ++int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 ++#define vmul_s32 vmul_u32 ++ ++float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t tmp; ++ __m64_128 res64; ++ tmp = _mm_mul_ps(_pM128(a),_pM128(b)); ++ _M64f(res64, tmp); //use low 64 bits ++ return res64; ++} ++ ++uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits in SSE ++ uint8x8_t res64; ++ __m128i mask, a128, b128, res; ++ mask = _mm_set1_epi16(0xff); ++ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); ++ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); ++ res = _mm_mullo_epi16 (a128, b128); ++ res = _mm_and_si128(res, mask); //to avoid saturation ++ res = _mm_packus_epi16 (res,res); //use only low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0]; ++ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1]; ++ return res; ++} ++ ++poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 ++_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b) ++{ ++ //may be optimized ++ poly8x8_t res64; ++ __m128i a64, b64, c1, res, tmp, bmasked; ++ int i; ++ a64 = _pM128i(a); ++ b64 = _pM128i(b); ++ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff.... 
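++    //the loop below is a carry-less (GF(2)) multiply: for each bit i of b, the byte multiply by the
++    //0/2^i mask yields (a << i) when that bit is set, and XOR accumulates the partial products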
++ c1 = vshrq_n_u8(c1,7); //0x1 ++ bmasked = _mm_and_si128(b64, c1); //0x1 ++ res = vmulq_u8(a64, bmasked); ++ for(i = 1; i<8; i++) { ++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here ++ bmasked = _mm_and_si128(b64, c1); //0x1 ++ tmp = vmulq_u8(a64, bmasked); ++ res = _mm_xor_si128(res, tmp); ++ } ++ return64 (res); ++} ++ ++int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits ++ //solution may be not optimal ++ __m128i a16, b16, r16_1, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (a16, b16); ++ //swap hi and low part of a and b to process the remaining data ++ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 ++ ++ r16_2 = _mm_mullo_epi16 (a16, b16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit ++ ++ return _mm_unpacklo_epi64(r16_1, r16_2); ++} ++ ++int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 ++#define vmulq_s16 _mm_mullo_epi16 ++ ++int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 ++#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 ++ ++float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 ++#define vmulq_f32 _mm_mul_ps ++ ++uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits ++ //solution may be not optimal ++ __m128i maskff, a16, b16, r16_1, r16_2; ++ maskff = _mm_set1_epi16(0xff); ++ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (a16, b16); ++ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation ++ //swap hi and low part of a and b to process the remaining data ++ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (a16, b16); ++ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation ++ return _mm_packus_epi16 (r16_1, r16_2); ++} ++ ++uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 ++#define vmulq_u16 _mm_mullo_epi16 ++ ++uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 ++#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 ++ ++poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 ++_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) ++{ ++ //may be optimized ++ __m128i c1, res, tmp, bmasked; ++ int i; ++ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 
++ c1 = vshrq_n_u8(c1,7); //0x1 ++ bmasked = _mm_and_si128(b, c1); //0x1 ++ res = vmulq_u8(a, bmasked); ++ for(i = 1; i<8; i++) { ++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here ++ bmasked = _mm_and_si128(b, c1); //0x1 ++ tmp = vmulq_u8(a, bmasked); ++ res = _mm_xor_si128(res, tmp); ++ } ++ return res; ++} ++ ++//************************* Vector long multiply *********************************** ++//**************************************************************************** ++int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0 ++{ ++ //no 8 bit simd multiply, need to go to 16 bits ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 ++ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit ++} ++ ++int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0 ++{ ++ #ifdef USE_SSE4 ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 ++ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 ++ #else ++ __m128i low, hi, a128,b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ low = _mm_mullo_epi16(a128,b128); ++ hi = _mm_mulhi_epi16(a128,b128); ++ return _mm_unpacklo_epi16(low,hi); ++ #endif ++} ++ ++int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0 ++{ ++ __m128i ab, ba, a128, b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 ++ return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++} ++ ++uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0 ++{ ++ //no 8 bit simd multiply, need to go to 16 bits ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 ++ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit ++} ++ ++uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0 ++{ ++ #ifdef USE_SSE4 ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 ++ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 ++ #else ++ __m128i a128,b128,low, hi; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ low = _mm_mullo_epi16(a128,b128); ++ hi = _mm_mulhi_epu16(a128,b128); ++ return _mm_unpacklo_epi16(low,hi); ++ #endif ++} ++ ++uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0 ++{ ++ ///may be not optimal compared with serial implementation ++ __m128i ab, ba, a128, b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 ++ return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++} ++ ++poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 ++_NEON2SSE_INLINE poly16x8_t 
vmull_p8(poly8x8_t a, poly8x8_t b)
++{
++ //may be optimized
++ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
++ int i;
++ a128 = _pM128i(a);
++ b128 = _pM128i(b);
++ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
++ c1 = vshrq_n_u8(c1,7); //0x1
++ bmasked = _mm_and_si128(b128, c1); //0x1
++
++ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
++ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
++ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
++ for(i = 1; i<8; i++) {
++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
++ bmasked = _mm_and_si128(b128, c1); //0x1
++ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
++ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
++ res = _mm_xor_si128(res, tmp);
++ }
++ return res;
++}
++
++//****************Vector saturating doubling long multiply **************************
++//*****************************************************************
++int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
++_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
++{
++ //the serial solution may be faster due to saturation
++ __m128i res;
++ res = vmull_s16(a, b);
++ return vqd_s32(res);
++}
++
++int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
++{
++ //the serial solution may be faster due to saturation
++ __m128i res;
++ res = vmull_s32(a,b);
++ return vqaddq_s64(res,res); //slow serial function!!!!
++}
++
++//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
++//******************************************************************************************
++int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
++_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
++{
++ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
++ int8x8_t res64;
++ __m128i b128, c128, res;
++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
++ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
++ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
++ res = _mm_mullo_epi16 (c128, b128);
++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
++ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
++ return64(res);
++}
++
++int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
++_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
++{
++ int16x4_t res64;
++ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
++}
++
++
++int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
++_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
++{
++ int32x2_t res64;
++ __m128i res;
++ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
++ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
++ return64(res);
++}
++
++float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
++_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
++{
++ //fma is coming soon, but right now:
++ __m128 res;
++ __m64_128 res64;
++ res = _mm_mul_ps (_pM128(c), _pM128(b));
++ res = _mm_add_ps
(_pM128(a), res); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0 ++{ ++ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits ++ uint8x8_t res64; ++ __m128i mask, b128, c128, res; ++ mask = _mm_set1_epi16(0xff); ++ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits ++ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits ++ res = _mm_mullo_epi16 (c128, b128); ++ res = _mm_and_si128(res, mask); //to avoid saturation ++ res = _mm_packus_epi16 (res, res); ++ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 ++#define vmla_u16 vmla_s16 ++ ++uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 ++#define vmla_u32 vmla_s32 ++ ++int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2,r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_add_epi8 (r16_1, a); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_add_epi8(r16_2, a_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_mullo_epi16 (c, b); ++ return _mm_add_epi16 (res, a); ++} ++ ++int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 ++ return _mm_add_epi32 (res, a); ++} ++ ++float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 ++{ ++ //fma is coming soon, but right now: ++ __m128 res; ++ res = _mm_mul_ps (c, b); ++ return _mm_add_ps (a, res); ++} ++ ++uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 
(c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_add_epi8 (r16_1, a); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_add_epi8(r16_2, a_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 ++#define vmlaq_u16 vmlaq_s16 ++ ++uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 ++#define vmlaq_u32 vmlaq_s32 ++ ++//********************** Vector widening multiply accumulate (long multiply accumulate): ++// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** ++//******************************************************************************************** ++int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0 ++{ ++ int16x8_t res; ++ res = vmull_s8(b, c); ++ return _mm_add_epi16 (res, a); ++} ++ ++int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int32x4_t res; ++ res = vmull_s16(b, c); ++ return _mm_add_epi32 (res, a); ++} ++ ++int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_s32( b, c); ++ return _mm_add_epi64 (res, a); ++} ++ ++uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0 ++{ ++ uint16x8_t res; ++ res = vmull_u8(b, c); ++ return _mm_add_epi16 (res, a); ++} ++ ++uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ uint32x4_t res; ++ res = vmull_u16(b, c); ++ return _mm_add_epi32 (res, a); ++} ++ ++uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_u32( b,c); ++ return _mm_add_epi64 (res, a); ++} ++ ++//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** ++//******************************************************************************************** ++int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits ++ int8x8_t res64; ++ 
__m128i res; ++ res64 = vmul_s8(b,c); ++ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); ++ return64(res); ++} ++ ++int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) ++{ ++ int16x4_t res64; ++ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); ++} ++ ++ ++int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0 ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 ++ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only ++ return64(res); ++} ++ ++float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) ++{ ++ __m128 res; ++ __m64_128 res64; ++ res = _mm_mul_ps (_pM128(c), _pM128(b)); ++ res = _mm_sub_ps (_pM128(a), res); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits ++ uint8x8_t res64; ++ __m128i res; ++ res64 = vmul_u8(b,c); ++ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); ++ return64(res); ++} ++ ++uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 ++#define vmls_u16 vmls_s16 ++ ++uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 ++#define vmls_u32 vmls_s32 ++ ++ ++int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); ++ r16_1 = _mm_sub_epi8 (a, r16_1); ++ //swap hi and low part of a, b, c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_sub_epi8 (a_2, r16_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_mullo_epi16 (c, b); ++ return _mm_sub_epi16 (a, res); ++} ++ ++int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 ++ return _mm_sub_epi32 (a, res); ++} ++ ++float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, 
float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 ++{ ++ __m128 res; ++ res = _mm_mul_ps (c, b); ++ return _mm_sub_ps (a, res); ++} ++ ++uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_sub_epi8 (a, r16_1); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_sub_epi8(a_2, r16_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 ++#define vmlsq_u16 vmlsq_s16 ++ ++uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 ++#define vmlsq_u32 vmlsq_s32 ++ ++//******************** Vector multiply subtract long (widening multiply subtract) ************************************ ++//************************************************************************************************************* ++int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0 ++{ ++ int16x8_t res; ++ res = vmull_s8(b, c); ++ return _mm_sub_epi16 (a, res); ++} ++ ++int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int32x4_t res; ++ res = vmull_s16(b, c); ++ return _mm_sub_epi32 (a, res); ++} ++ ++int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_s32( b,c); ++ return _mm_sub_epi64 (a, res); ++} ++ ++uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0 ++{ ++ uint16x8_t res; ++ res = vmull_u8(b, c); ++ return _mm_sub_epi16 (a, res); ++} ++ ++uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ uint32x4_t res; ++ res = vmull_u16(b, c); ++ return _mm_sub_epi32 (a, res); ++} ++ ++uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0 ++{ ++ //may be not optimal 
compared with serial implementation ++ int64x2_t res; ++ res = vmull_u32( b,c); ++ return _mm_sub_epi64 (a, res); ++} ++ ++//****** Vector saturating doubling multiply high ********************** ++//************************************************************************* ++int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int16x4_t res; ++ int32_t a32, b32, i; ++ for (i = 0; i<4; i++) { ++ a32 = (int32_t) a.m64_i16[i]; ++ b32 = (int32_t) b.m64_i16[i]; ++ a32 = (a32 * b32) >> 15; ++ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32; ++ } ++ return res; ++} ++ ++int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster ++{ ++ //may be not optimal compared with a serial solution ++ int32x2_t res64; ++ __m128i mask; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ int64x2_t mul; ++ mul = vmull_s32(a,b); ++ mul = _mm_slli_epi64(mul,1); //double the result ++ //at this point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++ return64(mul); ++} ++ ++int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 ++{ ++ __m128i res, res_lo, mask; ++ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; ++ res = _mm_mulhi_epi16 (a, b); ++ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation ++ res_lo = _mm_mullo_epi16 (a, b); ++ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit ++ res = _mm_add_epi16(res, res_lo); //combine results ++ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 ++} ++ ++int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target ++ __m128i ab, ba, mask, mul, mul1; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 ++ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul = _mm_slli_epi64(mul,1); //double the result ++ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 ++ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 ++ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul1 = _mm_slli_epi64(mul1,1); //double the result ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mul = _mm_unpacklo_epi64(mul, mul1); ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 
++ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++} ++ ++//********* Vector saturating rounding doubling multiply high **************** ++//**************************************************************************** ++//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order ++int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //may be not optimal compared with a serial solution ++ int32x2_t res64; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ __m128i res_sat, mask, mask1; ++ int64x2_t mul; ++ mul = vmull_s32(a,b); ++ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding ++ //at this point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++ return64(mul); ++} ++ ++int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 ++{ ++ __m128i mask, res; ++ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; ++ res = _mm_mulhrs_epi16 (a, b); ++ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 ++} ++ ++int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target ++ __m128i ab, ba, mask, mul, mul1, mask1; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 ++ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul = _mm_add_epi32 (mul, mask1); //actual rounding ++ ++ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 ++ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 ++ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding ++ //at this 
point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mul = _mm_unpacklo_epi64(mul, mul1); ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++} ++ ++//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** ++//************************************************************************************************************************* ++int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0 ++{ ++ //not optimal SIMD soulution, serial may be faster ++ __m128i res32; ++ res32 = vmull_s16(b, c); ++ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1); ++ return vqaddq_s32(res32, a); //saturation ++} ++ ++int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res64; ++ res64 = vmull_s32(b,c); ++ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1); ++ return vqaddq_s64(res64, a); //saturation ++} ++ ++//************************************************************************************ ++//****************** Vector subtract *********************************************** ++//************************************************************************************ ++int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0]; ++ return res64; ++} ++ ++ ++float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0]; ++ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1]; ++ return res; ++} ++ ++uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 ++#define vsub_u8 vsub_s8 ++ ++uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 ++#define vsub_u16 vsub_s16 ++ ++uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 ++#define vsub_u32 vsub_s32 ++ ++ ++uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_u64[0] = a.m64_u64[0] 
- b.m64_u64[0]; ++ return res64; ++} ++ ++ ++int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 ++#define vsubq_s8 _mm_sub_epi8 ++ ++int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 ++#define vsubq_s16 _mm_sub_epi16 ++ ++int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 ++#define vsubq_s32 _mm_sub_epi32 ++ ++int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 ++#define vsubq_s64 _mm_sub_epi64 ++ ++float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 ++#define vsubq_f32 _mm_sub_ps ++ ++uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 ++#define vsubq_u8 _mm_sub_epi8 ++ ++uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 ++#define vsubq_u16 _mm_sub_epi16 ++ ++uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 ++#define vsubq_u32 _mm_sub_epi32 ++ ++uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 ++#define vsubq_u64 _mm_sub_epi64 ++ ++//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** ++//*********************************************************************************** ++//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. ++int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a16, b16); ++} ++ ++int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a32, b32); ++} ++ ++int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi64 (a64, b64); ++} ++ ++uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a16, b16); ++} ++ ++uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a32, b32); ++} ++ ++uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi64 (a64, b64); ++} ++ ++//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** ++//***************************************************************************************************** ++int16x8_t vsubw_s8(int16x8_t a, 
int8x8_t b); // VSUBW.S8 q0,q0,d0 ++_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a, b16); ++} ++ ++int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 ++_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a, b32); ++} ++ ++int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 ++_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_sub_epi64 (a, b64); ++} ++ ++uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 ++_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a, b16); ++} ++ ++uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0 ++_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a, b32); ++} ++ ++uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 ++_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_sub_epi64 (a, b64); ++} ++ ++//************************Vector saturating subtract ********************************* ++//************************************************************************************* ++int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vqsubq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution ++{ ++ uint64x1_t res; ++ uint64_t a64,b64; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 - b64; ++ ++ a64 = (a64 >> 63) + (~_SIGNBIT64); ++ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) { ++ res.m64_u64[0] = a64; ++ } ++ return res; ++} ++ ++uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ 
return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
++}
++
++
++uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ uint64x1_t res;
++ uint64_t a64, b64;
++ a64 = _Ui64(a);
++ b64 = _Ui64(b);
++ if (a64 > b64) {
++ res.m64_u64[0] = a64 - b64;
++ } else {
++ res.m64_u64[0] = 0;
++ }
++ return res;
++}
++
++int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
++#define vqsubq_s8 _mm_subs_epi8
++
++int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
++#define vqsubq_s16 _mm_subs_epi16
++
++int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
++_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
++{
++ //no corresponding x86 SIMD solution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
++ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
++ c7fffffff = _mm_set1_epi32(0x7fffffff);
++ res = _mm_sub_epi32(a, b);
++ res_sat = _mm_srli_epi32(a, 31);
++ res_sat = _mm_add_epi32(res_sat, c7fffffff);
++ res_xor_a = _mm_xor_si128(res, a);
++ b_xor_a = _mm_xor_si128(b, a);
++ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
++ res_sat = _mm_and_si128(res_xor_a, res_sat);
++ res = _mm_andnot_si128(res_xor_a, res);
++ return _mm_or_si128(res, res_sat);
++}
++
++int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
++{
++ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
++ _NEON2SSE_ALIGN_16 uint64_t res[2];
++ _mm_store_si128((__m128i*)atmp, a);
++ _mm_store_si128((__m128i*)btmp, b);
++ res[0] = atmp[0] - btmp[0];
++ res[1] = atmp[1] - btmp[1];
++ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
++ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
++ }
++ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
++ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
++ }
++ return _mm_load_si128((__m128i*)res);
++}
++
++uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
++#define vqsubq_u8 _mm_subs_epu8
++
++uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
++#define vqsubq_u16 _mm_subs_epu16
++
++uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
++_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
++{
++ __m128i min, mask, sub;
++ min = _MM_MIN_EPU32(a, b); //SSE4.1
++ mask = _mm_cmpeq_epi32 (min, b);
++ sub = _mm_sub_epi32 (a, b);
++ return _mm_and_si128 ( sub, mask);
++}
++
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
++#ifdef USE_SSE4
++ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
++ {
++ __m128i c80000000, subb, suba, cmp, sub;
++ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
++ sub = _mm_sub_epi64 (a, b);
++ suba = _mm_sub_epi64 (a, c80000000);
++ subb = _mm_sub_epi64 (b, c80000000);
++ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
++ return _mm_and_si128 (sub, cmp); //saturation ++ } ++#else ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; ++ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0; ++ return _mm_load_si128((__m128i*)(res)); ++ } ++#endif ++ ++//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** ++//**************************************************************** ++int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0 ++{ ++ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit, ++ int8x8_t res64; ++ __m128i r16; ++ int8x8_t r; ++ r = vsub_s8 (a, b); ++ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 ++ r16 = _mm_srai_epi16 (r16, 1); //SSE2 ++ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits ++ return64(r16); ++} ++ ++int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vhsubq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++ ++int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vhsubq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vhsubq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vhsubq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vhsubq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 ++{ ++ // //need to deal with the possibility of internal overflow ++ __m128i c128, au,bu; ++ c128 = _mm_set1_epi8 (128); ++ au = _mm_add_epi8( a, c128); ++ bu = _mm_add_epi8( b, c128); ++ return vhsubq_u8(au,bu); ++} ++ ++int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i c8000, au,bu; ++ c8000 = _mm_set1_epi16(0x8000); ++ au = _mm_add_epi16( a, c8000); ++ bu = _mm_add_epi16( b, c8000); ++ return vhsubq_u16(au,bu); ++} ++ ++int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i a2, b2,r, b_1; ++ a2 = _mm_srai_epi32 (a,1); ++ b2 = _mm_srai_epi32 (b,1); ++ r = _mm_sub_epi32 (a2, b2); ++ b_1 = _mm_andnot_si128(a, b); //!a and b ++ b_1 = _mm_slli_epi32 (b_1,31); ++ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit ++ return _mm_sub_epi32(r,b_1); ++} ++ 
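The halving-subtract routines such as vhsubq_s32 just above (and vhsubq_u32 further below, which uses logical shifts) avoid widening by halving each operand first and then re-applying the borrow that truncation drops: (a - b) >> 1 == (a >> 1) - (b >> 1) - ((~a & b) & 1), which is what the _mm_andnot_si128 plus shift pair computes per lane. A minimal scalar sketch of that identity follows; it is illustrative only, not part of the header, the helper names are made up, and it assumes the usual arithmetic-shift behaviour of signed >> on the host compiler.

#include <stdint.h>
#include <stdio.h>

/* standalone sketch, not from the patch: reference result, computed with
   an exact 64-bit difference and then floor-halving, as NEON's vhsub does */
static int32_t hsub_ref(int32_t a, int32_t b)
{
    return (int32_t)(((int64_t)a - (int64_t)b) >> 1);
}

/* the trick used above: halve first, then subtract the lost borrow bit */
static int32_t hsub_trick(int32_t a, int32_t b)
{
    return (a >> 1) - (b >> 1) - ((~a & b) & 1);
}

int main(void)
{
    const int32_t v[] = { 0, 1, -1, 7, -8, 12345, -12345, 2147483647, -2147483647 - 1 };
    int i, j, bad = 0;
    for (i = 0; i < 9; i++)
        for (j = 0; j < 9; j++)
            if (hsub_ref(v[i], v[j]) != hsub_trick(v[i], v[j]))
                bad++;
    printf("mismatches: %d\n", bad); /* expected: 0 */
    return 0;
}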
++uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 ++{ ++ __m128i avg; ++ avg = _mm_avg_epu8 (a, b); ++ return _mm_sub_epi8(a, avg); ++} ++ ++uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 ++{ ++ __m128i avg; ++ avg = _mm_avg_epu16 (a, b); ++ return _mm_sub_epi16(a, avg); ++} ++ ++uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i a2, b2,r, b_1; ++ a2 = _mm_srli_epi32 (a,1); ++ b2 = _mm_srli_epi32 (b,1); ++ r = _mm_sub_epi32 (a2, b2); ++ b_1 = _mm_andnot_si128(a, b); //!a and b ++ b_1 = _mm_slli_epi32 (b_1,31); ++ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit ++ return _mm_sub_epi32(r,b_1); ++} ++ ++//******* Vector subtract high half (truncated) ** ************ ++//************************************************************ ++int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum, sum8; ++ sum = _mm_sub_epi16 (a, b); ++ sum8 = _mm_srai_epi16 (sum, 8); ++ sum8 = _mm_packs_epi16(sum8,sum8); ++ return64(sum8); ++} ++ ++int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0 ++{ ++ int16x4_t res64; ++ __m128i sum, sum16; ++ sum = _mm_sub_epi32 (a, b); ++ sum16 = _mm_srai_epi32 (sum, 16); ++ sum16 = _mm_packs_epi32(sum16,sum16); ++ return64(sum16); ++} ++ ++int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b) ++{ ++ int32x2_t res64; ++ __m128i sub; ++ sub = _mm_sub_epi64 (a, b); ++ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sub); ++} ++ ++uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum, sum8; ++ sum = _mm_sub_epi16 (a, b); ++ sum8 = _mm_srli_epi16 (sum, 8); ++ sum8 = _mm_packus_epi16(sum8,sum8); ++ return64(sum8); ++} ++ ++uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0 ++{ ++ uint16x4_t res64; ++ __m128i sum, sum16; ++ sum = _mm_sub_epi32 (a, b); ++ sum16 = _mm_srli_epi32 (sum, 16); ++ sum16 = _MM_PACKUS1_EPI32(sum16); ++ return64(sum16); ++} ++ ++uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 ++#define vsubhn_u64 vsubhn_s64 ++ ++//************ Vector rounding subtract high half ********************* ++//********************************************************************* ++int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sub = _mm_srai_epi16 (sub, 8); //get high half ++ sub = _mm_add_epi16 (sub, mask1); 
//actual rounding ++ sub = _mm_packs_epi16 (sub, sub); ++ return64(sub); ++} ++ ++int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ int16x4_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sub = _mm_srai_epi32 (sub, 16); //get high half ++ sub = _mm_add_epi32 (sub, mask1); //actual rounding ++ sub = _mm_packs_epi32 (sub, sub); ++ return64(sub); ++} ++ ++int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ int32x2_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi64 (a, b); ++ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero ++ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding ++ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sub); ++} ++ ++uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sub = _mm_srai_epi16 (sub, 8); //get high half ++ sub = _mm_add_epi16 (sub, mask1); //actual rounding ++ sub = _mm_packus_epi16 (sub, sub); ++ return64(sub); ++} ++ ++uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ uint16x4_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sub = _mm_srai_epi32 (sub, 16); //get high half ++ sub = _mm_add_epi32 (sub, mask1); //actual rounding ++ sub = _MM_PACKUS1_EPI32 (sub); ++ return64(sub); ++} ++ ++uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++#define vrsubhn_u64 vrsubhn_s64 ++ ++//*********** Vector saturating doubling multiply subtract long ******************** ++//************************************************************************************ ++int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) ++{ ++ //not optimal SIMD soulution, serial may be faster ++ __m128i res32, mask; ++ int32x4_t res; ++ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ res = vmull_s16(b, c); ++ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered ++ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask); ++ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000 ++ return vqsubq_s32(a, res32); //saturation ++} ++ ++int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ 
__m128i res64, mask; ++ int64x2_t res; ++ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000}; ++ res = vmull_s32(b, c); ++ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered ++ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask); ++ res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000 ++ return vqsubq_s64(a, res64); //saturation ++} ++ ++//****************** COMPARISON *************************************** ++//******************* Vector compare equal ************************************* ++//**************************************************************************** ++uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) ); ++ return64f(res); ++} ++ ++uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 ++#define vceq_p8 vceq_u8 ++ ++ ++uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_s8 _mm_cmpeq_epi8 ++ ++uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 ++#define vceqq_s16 _mm_cmpeq_epi16 ++ ++uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 ++#define vceqq_s32 _mm_cmpeq_epi32 ++ ++uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpeq_ps(a,b); ++ return _M128i(res); ++} ++ ++uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_u8 _mm_cmpeq_epi8 ++ ++uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 ++#define vceqq_u16 _mm_cmpeq_epi16 ++ ++uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 ++#define vceqq_u32 _mm_cmpeq_epi32 ++ ++uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_p8 _mm_cmpeq_epi8 ++ ++//******************Vector compare greater-than or equal************************* ++//******************************************************************************* ++//in IA SIMD no greater-than-or-equal 
comparison for integers, ++// there is greater-than available only, so we need the following tricks ++ ++uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vcgeq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vcgeq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vcgeq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries ++ return64f(res); ++} ++ ++uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vcgeq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vcgeq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b) ++{ ++ //serial solution looks faster ++ uint32x2_t res64; ++ return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); ++} ++ ++ ++ ++uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi8 ( a, b); ++ m2 = _mm_cmpeq_epi8 ( a, b); ++ return _mm_or_si128 ( m1, m2); ++} ++ ++uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi16 ( a, b); ++ m2 = _mm_cmpeq_epi16 ( a, b); ++ return _mm_or_si128 ( m1,m2); ++} ++ ++uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi32 (a, b); ++ m2 = _mm_cmpeq_epi32 (a, b); ++ return _mm_or_si128 (m1, m2); ++} ++ ++uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpge_ps(a,b); //use only 2 first entries ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 ++{ ++ //no unsigned chars comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu8(a, b); ++ return _mm_cmpeq_epi8(cmp, a); //a>=b ++ #else ++ __m128i c128, as, bs, m1, m2; ++ c128 = _mm_set1_epi8 (128); ++ as = _mm_sub_epi8( a, c128); ++ bs = _mm_sub_epi8( b, c128); ++ m1 = _mm_cmpgt_epi8( as, bs); ++ m2 = _mm_cmpeq_epi8 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t 
vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 ++{ ++ //no unsigned shorts comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu16(a, b); ++ return _mm_cmpeq_epi16(cmp, a); //a>=b ++ #else ++ __m128i c8000, as, bs, m1, m2; ++ c8000 = _mm_set1_epi16 (0x8000); ++ as = _mm_sub_epi16(a,c8000); ++ bs = _mm_sub_epi16(b,c8000); ++ m1 = _mm_cmpgt_epi16(as, bs); ++ m2 = _mm_cmpeq_epi16 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 ++{ ++ //no unsigned ints comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu32(a, b); ++ return _mm_cmpeq_epi32(cmp, a); //a>=b ++ #else ++ //serial solution may be faster ++ __m128i c80000000, as, bs, m1, m2; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ as = _mm_sub_epi32(a,c80000000); ++ bs = _mm_sub_epi32(b,c80000000); ++ m1 = _mm_cmpgt_epi32 (as, bs); ++ m2 = _mm_cmpeq_epi32 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++//**********************Vector compare less-than or equal****************************** ++//*************************************************************************************** ++//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks ++ ++uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vcleq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vcleq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vcleq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0? ++_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmple_ps(_pM128(a),_pM128(b)); ++ return64f(res); ++} ++ ++uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++#define vcle_u8(a,b) vcge_u8(b,a) ++ ++ ++uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 ++#define vcle_u16(a,b) vcge_u16(b,a) ++ ++ ++uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++#define vcle_u32(a,b) vcge_u32(b,a) ++ ++uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... ++ res = _mm_cmpgt_epi8 ( a, b); ++ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal ++} ++ ++uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 
++ res = _mm_cmpgt_epi16 ( a, b); ++ return _mm_andnot_si128 (res, c1); ++} ++ ++uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... ++ res = _mm_cmpgt_epi32 ( a, b); ++ return _mm_andnot_si128 (res, c1); ++} ++ ++uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmple_ps(a,b); ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 ++ { ++ //no unsigned chars comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu8(a, b); ++ return _mm_cmpeq_epi8(cmp, a); //a<=b ++ } ++#else ++ #define vcleq_u8(a,b) vcgeq_u8(b,a) ++#endif ++ ++ ++uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 ++ { ++ //no unsigned shorts comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu16(a, b); ++ return _mm_cmpeq_epi16(cmp, a); //a<=b ++ } ++#else ++ #define vcleq_u16(a,b) vcgeq_u16(b,a) ++#endif ++ ++ ++uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 ++ { ++ //no unsigned chars comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu32(a, b); ++ return _mm_cmpeq_epi32(cmp, a); //a<=b ++ } ++#else ++//solution may be not optimal compared with the serial one ++ #define vcleq_u32(a,b) vcgeq_u32(b,a) ++#endif ++ ++ ++//****** Vector compare greater-than ****************************************** ++//************************************************************************** ++uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries ++ return64f(res); ++} ++ ++uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vcgtq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vcgtq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); 
// VCGT.U32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vcgtq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++#define vcgtq_s8 _mm_cmpgt_epi8 ++ ++uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++#define vcgtq_s16 _mm_cmpgt_epi16 ++ ++uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++#define vcgtq_s32 _mm_cmpgt_epi32 ++ ++uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpgt_ps(a,b); //use only 2 first entries ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 ++{ ++ //no unsigned chars comparison, only signed available,so need the trick ++ __m128i c128, as, bs; ++ c128 = _mm_set1_epi8 (128); ++ as = _mm_sub_epi8(a,c128); ++ bs = _mm_sub_epi8(b,c128); ++ return _mm_cmpgt_epi8 (as, bs); ++} ++ ++uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 ++{ ++ //no unsigned short comparison, only signed available,so need the trick ++ __m128i c8000, as, bs; ++ c8000 = _mm_set1_epi16 (0x8000); ++ as = _mm_sub_epi16(a,c8000); ++ bs = _mm_sub_epi16(b,c8000); ++ return _mm_cmpgt_epi16 ( as, bs); ++} ++ ++uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0 ++{ ++ //no unsigned int comparison, only signed available,so need the trick ++ __m128i c80000000, as, bs; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ as = _mm_sub_epi32(a,c80000000); ++ bs = _mm_sub_epi32(b,c80000000); ++ return _mm_cmpgt_epi32 ( as, bs); ++} ++ ++//********************* Vector compare less-than ************************** ++//************************************************************************* ++uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!! ++ ++ ++uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!! ++ ++ ++uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!! ++ ++ ++uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!! ++ ++uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!! ++ ++uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 ++#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!! ++ ++uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!! ++ ++uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! ++ ++uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! ++ ++uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 
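The vcgtq_u8/u16/u32 implementations above (which the vclt*/vcltq* macros reuse with swapped arguments) rely on the fact that SSE2 only offers signed integer compares: subtracting the sign-bit bias (0x80, 0x8000, 0x80000000) from both operands maps the unsigned range onto the signed range while preserving order. A minimal standalone sketch of that trick for 8-bit lanes, assuming only SSE2 and compiled outside this patch; the helper name cmpgt_u8_sse2 and the test harness are not part of the header:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>
    #include <stdio.h>

    /* Unsigned 8-bit a > b emulated with a signed compare: subtracting 0x80
       from both operands maps 0..255 onto -128..127 without changing order. */
    static __m128i cmpgt_u8_sse2(__m128i a, __m128i b)
    {
        const __m128i bias = _mm_set1_epi8((char)0x80);
        return _mm_cmpgt_epi8(_mm_sub_epi8(a, bias), _mm_sub_epi8(b, bias));
    }

    int main(void)
    {
        uint8_t av[16], bv[16], rv[16];
        int i;
        for (i = 0; i < 16; i++) { av[i] = (uint8_t)(i * 17); bv[i] = (uint8_t)(255 - i * 13); }
        __m128i r = cmpgt_u8_sse2(_mm_loadu_si128((const __m128i *)av),
                                  _mm_loadu_si128((const __m128i *)bv));
        _mm_storeu_si128((__m128i *)rv, r);
        for (i = 0; i < 16; i++)   /* 0xff where av[i] > bv[i] (unsigned), 0x00 otherwise */
            printf("%3u > %3u -> 0x%02x (expected 0x%02x)\n",
                   (unsigned)av[i], (unsigned)bv[i], (unsigned)rv[i],
                   av[i] > bv[i] ? 0xffu : 0u);
        return 0;
    }

For the greater-than-or-equal variants earlier in this section the header additionally takes an SSE4.1 shortcut (_mm_max_epu8 followed by _mm_cmpeq_epi8) when USE_SSE4 is defined; the bias-and-compare fallback above is the path used otherwise.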
++ ++uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! ++ ++uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! ++ ++uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 ++#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! ++ ++uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! ++ ++//*****************Vector compare absolute greater-than or equal ************ ++//*************************************************************************** ++uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmpge_ps ( a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmpge_ps ( a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//********Vector compare absolute less-than or equal ****************** ++//******************************************************************** ++uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmple_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmple_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//******** Vector compare absolute greater-than ****************** ++//****************************************************************** ++uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmpgt_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmpgt_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ 
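The vcage/vcale/vcagt family above emulates NEON's absolute comparisons by clearing bit 31 of each float (the IEEE-754 sign bit) with a 0x7fffffff mask and then issuing an ordinary SSE compare. A minimal standalone sketch of the same idea, assuming SSE2 and compiled outside this patch; the helper name acge_f32_sse and the test values are illustrative only, and NaN inputs are not considered:

    #include <emmintrin.h>   /* SSE2 (pulls in the SSE float compares as well) */
    #include <math.h>
    #include <stdio.h>

    /* |a| >= |b| per lane: ANDing with 0x7fffffff clears the sign bit, which is
       exactly the absolute value for IEEE-754 floats, so a plain compare-ge on
       the masked values reproduces the VACGE result mask. */
    static __m128 acge_f32_sse(__m128 a, __m128 b)
    {
        const __m128 absmask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
        return _mm_cmpge_ps(_mm_and_ps(a, absmask), _mm_and_ps(b, absmask));
    }

    int main(void)
    {
        float av[4] = { -3.5f,  1.25f, -0.0f,  8.0f };
        float bv[4] = {  2.0f, -1.25f,  0.5f, -9.0f };
        unsigned rv[4];
        __m128 m = acge_f32_sse(_mm_loadu_ps(av), _mm_loadu_ps(bv));
        _mm_storeu_si128((__m128i *)rv, _mm_castps_si128(m));
        for (int i = 0; i < 4; i++)   /* all-ones lane means |av[i]| >= |bv[i]| */
            printf("|%g| >= |%g| -> 0x%08x (expected %s)\n",
                   av[i], bv[i], rv[i],
                   fabsf(av[i]) >= fabsf(bv[i]) ? "all-ones" : "zero");
        return 0;
    }

Each result lane is all-ones when the condition holds and zero otherwise, matching the mask convention used throughout the header.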
++//***************Vector compare absolute less-than *********************** ++//************************************************************************* ++uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmplt_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmplt_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//*************************Vector test bits************************************ ++//***************************************************************************** ++/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them ++with the corresponding element of a second vector. If the result is not zero, the ++corresponding element in the destination vector is set to all ones. Otherwise, it is set to ++all zeros. */ ++ ++uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vtstq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vtstq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vtstq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 ++#define vtst_u8 vtst_s8 ++ ++uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 ++#define vtst_u16 vtst_s16 ++ ++uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 ++#define vtst_u32 vtst_s32 ++ ++ ++uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 ++#define vtst_p8 vtst_u8 ++ ++uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi8 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi16 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); 
//0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi32 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 ++#define vtstq_u8 vtstq_s8 ++ ++uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 ++#define vtstq_u16 vtstq_s16 ++ ++uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 ++#define vtstq_u32 vtstq_s32 ++ ++uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 ++#define vtstq_p8 vtstq_u8 ++ ++//****************** Absolute difference ******************** ++//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** ++//************************************************************ ++int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vabdq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vabdq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vabdq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vabdq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vabdq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vabdq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = vabdq_f32(_pM128(a), _pM128(b)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi8 (a, b); ++ return _mm_abs_epi8 (res); ++} ++ ++int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi16 (a,b); ++ return _mm_abs_epi16 (res); ++} ++ ++int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi32 (a,b); ++ return _mm_abs_epi32 (res); ++} ++ ++uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u8(a,b); ++ difab = _mm_sub_epi8(a,b); ++ difba = _mm_sub_epi8 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t 
vabdq_u16(uint16x8_t a, uint16x8_t b) ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u16(a,b); ++ difab = _mm_sub_epi16(a,b); ++ difba = _mm_sub_epi16 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u32(a,b); ++ difab = _mm_sub_epi32(a,b); ++ difba = _mm_sub_epi32 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 ++{ ++ __m128i c1; ++ __m128 res; ++ c1 = _mm_set1_epi32(0x7fffffff); ++ res = _mm_sub_ps (a, b); ++ return _mm_and_ps (res, *(__m128*)&c1); ++} ++ ++//************ Absolute difference - long ************************** ++//******************************************************************** ++int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return vabdq_s16(a16, b16); ++ ++} ++ ++int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return vabdq_s32(a32, b32); ++} ++ ++int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no optimal SIMD solution, serial looks faster ++ _NEON2SSE_ALIGN_16 int64_t res[2]; ++ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0]; ++ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0]; ++ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1]; ++ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) ++{ ++ __m128i res; ++ res = vsubl_u8(a,b); ++ return _mm_abs_epi16(res); ++} ++ ++uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) ++{ ++ __m128i res; ++ res = vsubl_u16(a,b); ++ return _mm_abs_epi32(res); ++} ++ ++uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0]; ++ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0]; ++ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1]; ++ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//**********Absolute difference and 
accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* ++//********************************************************************************************* ++int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) ++{ ++ int8x8_t res64; ++ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); ++} ++ ++int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) ++{ ++ int16x4_t res64; ++ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) ++{ ++ int32x2_t res64; ++ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 ++#define vaba_u8 vaba_s8 ++ ++ ++uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0 ++#define vaba_u16 vaba_s16 ++ ++uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) ++{ ++ uint32x2_t res64; ++ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 ++{ ++ int8x16_t sub; ++ sub = vabdq_s8(b, c); ++ return vaddq_s8( a, sub); ++} ++ ++int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 ++{ ++ int16x8_t sub; ++ sub = vabdq_s16(b, c); ++ return vaddq_s16( a, sub); ++} ++ ++int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 ++{ ++ int32x4_t sub; ++ sub = vabdq_s32(b, c); ++ return vaddq_s32( a, sub); ++} ++ ++uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) ++{ ++ uint8x16_t sub; ++ sub = vabdq_u8(b, c); ++ return vaddq_u8( a, sub); ++} ++ ++uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) ++{ ++ uint16x8_t sub; ++ sub = vabdq_u16(b, c); ++ return vaddq_u16( a, sub); ++} ++ ++uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) ++{ ++ uint32x4_t sub; ++ sub = vabdq_u32(b, c); ++ return vaddq_u32( a, sub); ++} ++ ++//************** Absolute difference and accumulate - long ******************************** ++//************************************************************************************* ++int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0 ++{ ++ __m128i b16, c16, res; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, ++ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); ++ return _mm_add_epi16 (a, res); ++} ++ ++int32x4_t vabal_s16(int32x4_t 
a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0 ++{ ++ __m128i b32, c32, res; ++ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 ++ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 ++ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); ++ return _mm_add_epi32 (a, res); ++} ++ ++int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res; ++ res = vabdl_s32(b,c); ++ return _mm_add_epi64(a, res); ++} ++ ++uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ __m128i b16, c16, res; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, ++ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); ++ return _mm_add_epi16 (a, res); ++} ++ ++uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) ++{ ++ __m128i b32, c32, res; ++ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 ++ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 ++ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); ++ return _mm_add_epi32 (a, res); ++} ++ ++uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res; ++ res = vabdl_u32(b,c); ++ return _mm_add_epi64(a, res); ++} ++ ++//*********************************************************************************** ++//**************** Maximum and minimum operations ********************************** ++//*********************************************************************************** ++//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ******* ++//*********************************************************************************** ++int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i res; ++ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial ++ return64(res); ++} ++ ++float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; ++ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; ++ return res; ++} ++ ++int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 ++#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 ++ ++int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 ++#define vmaxq_s16 _mm_max_epi16 ++ ++int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 ++#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 ++ ++uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 ++#define vmaxq_u8 _mm_max_epu8 ++ ++uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 ++#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 ++ ++uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 ++#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 ++ ++ ++float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 ++#define vmaxq_f32 _mm_max_ps ++ ++//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** ++//*********************************************************************************************************** ++int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i res; ++ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial ++ return64(res); ++} ++ ++float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; ++ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; ++ return res; ++} ++ ++int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 ++#define vminq_s8 _MM_MIN_EPI8 //SSE4.1 ++ ++int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 ++#define vminq_s16 _mm_min_epi16 ++ ++int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 ++#define vminq_s32 _MM_MIN_EPI32 //SSE4.1 ++ ++uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 ++#define vminq_u8 _mm_min_epu8 ++ ++uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 ++#define vminq_u16 _MM_MIN_EPU16 //SSE4.1 ++ ++uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 ++#define vminq_u32 _MM_MIN_EPU32 //SSE4.1 ++ ++float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 ++#define vminq_f32 _mm_min_ps ++ ++//************* Pairwise addition operations. 
************************************** ++//************************************************************************************ ++//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector ++int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit and then pack ++ int8x8_t res64; ++ __m128i a16, b16, res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 ++ res = _mm_hadd_epi16 (a16, b16); ++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits ++ return64(res); ++} ++ ++int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ __m128i hadd128; ++ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); ++ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(hadd128); ++} ++ ++ ++int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i hadd128; ++ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); ++ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(hadd128); ++} ++ ++ ++uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0 ++{ ++ // no 8 bit hadd in IA32, need to go to 16 bit and then pack ++ uint8x8_t res64; ++// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work ++ __m128i mask8, a16, b16, res; ++ mask8 = _mm_set1_epi16(0xff); ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 ++ res = _mm_hadd_epi16 (a16, b16); ++ res = _mm_and_si128(res, mask8); //to avoid saturation ++ res = _mm_packus_epi16 (res,res); //use low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0 ++{ ++ // solution may be not optimal, serial execution may be faster ++ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed ++ uint16x4_t res64; ++ __m128i c32767, cfffe, as, bs, res; ++ c32767 = _mm_set1_epi16 (32767); ++ cfffe = _mm_set1_epi16 (0xfffe); ++ as = _mm_sub_epi16 (_pM128i(a), c32767); ++ bs = _mm_sub_epi16 (_pM128i(b), c32767); ++ res = _mm_hadd_epi16 (as, bs); ++ res = _mm_add_epi16 (res, cfffe); ++ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(res); ++} ++ ++uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster ++{ ++ //hadd doesn't work for unsigned values ++ uint32x2_t res64; ++ __m128i ab, ab_sh, res; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 ++ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0 ++ res = _mm_add_epi32(ab, ab_sh); ++ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(res); ++} ++ ++float32x2_t vpadd_f32(float32x2_t a, float32x2_t 
b); // VPADD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b) ++{ ++ __m128 hadd128; ++ __m64_128 res64; ++ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b)); ++ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits ++ _M64f(res64, hadd128); ++ return res64; ++} ++ ++ ++//************************** Long pairwise add ********************************** ++//********************************************************************************* ++//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, ++// and places the final results in the destination vector. ++ ++int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 ++_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit anyway ++ __m128i a16; ++ int16x4_t res64; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits ++ return64(a16); ++} ++ ++int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 ++_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0 ++{ ++ // solution may be not optimal, serial execution may be faster ++ int32x2_t res64; ++ __m128i r32_1; ++ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); ++ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits ++ return64(r32_1); ++} ++ ++int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1]; ++ return res; ++} ++ ++uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0 ++{ ++ // no 8 bit hadd in IA32, need to go to 16 bit ++// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work ++ uint16x4_t res64; ++ __m128i a16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits ++ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits ++ return64(a16); ++} ++ ++uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than a SIMD one ++ uint32x2_t res; ++ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1]; ++ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3]; ++ return res; ++} ++ ++uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1]; ++ return res; ++} ++ ++int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 ++_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i r16_1, r16_2; ++ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 ++ //swap hi and low part of r to process the remaining data ++ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r16_2 = _MM_CVTEPI8_EPI16 (r16_2); ++ return _mm_hadd_epi16 (r16_1, r16_2); ++} ++ ++int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 ++_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i 
r32_1, r32_2; ++ r32_1 = _MM_CVTEPI16_EPI32(a); ++ //swap hi and low part of r to process the remaining data ++ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r32_2 = _MM_CVTEPI16_EPI32 (r32_2); ++ return _mm_hadd_epi32 (r32_1, r32_2); ++} ++ ++int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 ++{ ++ _NEON2SSE_ALIGN_16 int32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 int64_t res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; ++ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 ++_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i r16_1, r16_2; ++ r16_1 = _MM_CVTEPU8_EPI16(a); ++ //swap hi and low part of r to process the remaining data ++ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r16_2 = _MM_CVTEPU8_EPI16 (r16_2); ++ return _mm_hadd_epi16 (r16_1, r16_2); ++} ++ ++uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than a SIMD one ++ _NEON2SSE_ALIGN_16 uint16_t atmp[8]; ++ _NEON2SSE_ALIGN_16 uint32_t res[4]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; ++ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; ++ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; ++ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; ++ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//************************ Long pairwise add and accumulate ************************** ++//**************************************************************************************** ++//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, ++// and accumulates the values of the results into the elements of the destination (wide) vector ++int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 ++_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) ++{ ++ int16x4_t res64; ++ return64(vpadalq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 ++_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) ++{ ++ int32x2_t res64; ++ return64(vpadalq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 ++_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0]; ++ return res; ++} ++ ++uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) ++{ ++ uint16x4_t res64; ++ return64(vpadalq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 
d0,d0 ++_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) ++{ ++ uint32x2_t res64; ++ return64(vpadalq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 ++_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0]; ++ return res; ++} ++ ++int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 ++_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 ++{ ++ int16x8_t pad; ++ pad = vpaddlq_s8(b); ++ return _mm_add_epi16 (a, pad); ++} ++ ++int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 ++_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 ++{ ++ int32x4_t pad; ++ pad = vpaddlq_s16(b); ++ return _mm_add_epi32(a, pad); ++} ++ ++int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 ++_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) ++{ ++ int64x2_t pad; ++ pad = vpaddlq_s32(b); ++ return _mm_add_epi64 (a, pad); ++} ++ ++uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 ++_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 ++{ ++ uint16x8_t pad; ++ pad = vpaddlq_u8(b); ++ return _mm_add_epi16 (a, pad); ++} ++ ++uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x4_t pad; ++ pad = vpaddlq_u16(b); ++ return _mm_add_epi32(a, pad); ++} //no optimal SIMD solution, serial is faster ++ ++uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no optimal SIMD solution, serial is faster ++ uint64x2_t pad; ++ pad = vpaddlq_u32(b); ++ return _mm_add_epi64(a, pad); ++} //no optimal SIMD solution, serial is faster ++ ++//********** Folding maximum ************************************* ++//******************************************************************* ++//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors, ++//and copies the larger of each pair into the corresponding element in the destination ++// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison ++int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0 ++{ ++ int8x8_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1 ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(max); //we need 64 bits only ++} ++ ++int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ int16x4_t res64; ++ __m128i ab, ab1, max; ++ 
_NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ max = _mm_max_epi16 (ab, ab1); ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(max); ++} ++ ++int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ int32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} ++ ++uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0 ++{ ++ uint8x8_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ max = _mm_max_epu8 (ab, ab1); // SSE4.1 ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(max); ++} ++ ++uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ uint16x4_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ max = _MM_MAX_EPU16 (ab, ab1); ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(max); ++} ++ ++uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ uint32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} //serial solution looks faster than a SIMD one ++ ++float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; ++ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; ++ return res; ++} ++ ++// ***************** Folding minimum **************************** ++// ************************************************************** ++//vpmin -> takes minimum of adjacent pairs ++int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0 ++{ ++ int8x8_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding ++ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1 ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(min); ++} ++ ++int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ int16x4_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ min = _mm_min_epi16 (ab, ab1); ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(min); ++} ++ ++int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ int32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} ++ ++uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0 ++{ ++ uint8x8_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ min = _mm_min_epu8 (ab, ab1); // SSE4.1 ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(min); ++} ++ ++uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ uint16x4_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask ++ min = _MM_MIN_EPU16 (ab, ab1); ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(min); ++} ++ ++uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ uint32x2_t res; ++ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; ++ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; ++ return res; ++} ++ ++float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; ++ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; ++ return res; ++} ++ ++//*************************************************************** ++//*********** Reciprocal/Sqrt ************************************ ++//*************************************************************** ++//****************** Reciprocal estimate ******************************* ++//the ARM NEON and x86 SIMD results may be slightly different ++float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 ++_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = _mm_rcp_ps(_pM128(a)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! 
No reciprocal for ints in IA32 available ++ uint32x2_t res; ++ float resf, r; ++ int i, q, s; ++ for (i =0; i<2; i++){ ++ if((a.m64_u32[i] & 0x80000000) == 0) { ++ res.m64_u32[i] = 0xffffffff; ++ }else{ ++ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); ++ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ ++ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res.m64_u32[i] = r * (uint32_t)(1 << 31); ++ } ++ } ++ return res; ++} ++ ++float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 ++#define vrecpeq_f32 _mm_rcp_ps ++ ++ ++uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 uint32_t res[4]; ++ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000}; ++ float resf, r; ++ int i, q, s; ++ __m128i res128, mask, zero; ++ _mm_store_si128((__m128i*)atmp, a); ++ zero = _mm_setzero_si128(); ++ for (i =0; i<4; i++){ ++ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31)) ++ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ ++ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); ++ } ++ res128 = _mm_load_si128((__m128i*)res); ++ mask = _mm_and_si128(a, *(__m128i*)c80000000); ++ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff ++ return _mm_or_si128(res128, mask); ++} ++ ++//**********Reciprocal square root estimate **************** ++//********************************************************** ++//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster ++//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers ++////the ARM NEON and x86 SIMD results may be slightly different ++float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 ++_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = _mm_rsqrt_ps(_pM128(a)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double ++ uint32x2_t res; ++ __m128 tmp; ++ float r, resf, coeff; ++ int i,q0, q1, s;; ++ for (i =0; i<2; i++){ ++ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff ++ res.m64_u32[i] = 0xffffffff; ++ }else{ ++ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); ++ coeff = (resf < 0.5)? 
512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ ++ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ ++ r = ((float)q0 + 0.5) / coeff; ++ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ ++ _mm_store_ss(&r, tmp); ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res.m64_u32[i] = r * (((uint32_t)1) << 31); ++ } ++ } ++ return res; ++} ++ ++float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 ++#define vrsqrteq_f32 _mm_rsqrt_ps ++ ++uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4]; ++ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)}; ++ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000}; ++ __m128 tmp; ++ __m128i res128, mask, zero; ++ float r, resf, coeff; ++ int i,q0, q1, s; ++ _mm_store_si128((__m128i*)atmp, a); ++ zero = _mm_setzero_si128(); ++ for (i =0; i<4; i++){ ++ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31))); ++ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ ++ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ ++ r = ((float)q0 + 0.5) / coeff; ++ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ ++ _mm_store_ss(&r, tmp); ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); ++ } ++ res128 = _mm_load_si128((__m128i*)res); ++ mask = _mm_and_si128(a, *(__m128i*)c_c0000000); ++ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff ++ return _mm_or_si128(res128, mask); ++} ++//************ Reciprocal estimate/step and 1/sqrt estimate/step *************************** ++//****************************************************************************************** ++//******VRECPS (Vector Reciprocal Step) *************************************************** ++//multiplies the elements of one vector by the corresponding elements of another vector, ++//subtracts each of the results from 2, and places the final results into the elements of the destination vector. ++ ++float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 ++_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = vrecpsq_f32(_pM128(a), _pM128(b)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 ++_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0 ++{ ++ __m128 f2, mul; ++ f2 = _mm_set1_ps(2.); ++ mul = _mm_mul_ps(a,b); ++ return _mm_sub_ps(f2,mul); ++} ++ ++//*****************VRSQRTS (Vector Reciprocal Square Root Step) ***************************** ++//multiplies the elements of one vector by the corresponding elements of another vector, ++//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector. 
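++
++//A minimal usage sketch (an illustration, not part of the upstream NEON_2_SSE header): the
++//estimate/step intrinsics of this section are normally combined by callers into a Newton-Raphson
++//iteration. The sketch assumes only intrinsics defined in this header (vrecpe_f32, vrecps_f32,
++//vrsqrte_f32, vrsqrts_f32 and vmul_f32); the demo_* helper names are illustrative only.
++//
++// static float32x2_t demo_reciprocal(float32x2_t d)
++// {
++//     float32x2_t x = vrecpe_f32(d);       //rough estimate of 1/d
++//     x = vmul_f32(vrecps_f32(d, x), x);   //one refinement step: x *= (2 - d*x)
++//     x = vmul_f32(vrecps_f32(d, x), x);   //second step for near-full precision
++//     return x;
++// }
++//
++// static float32x2_t demo_rsqrt(float32x2_t d)
++// {
++//     float32x2_t x = vrsqrte_f32(d);                  //estimate of 1/sqrt(d)
++//     x = vmul_f32(x, vrsqrts_f32(vmul_f32(d, x), x)); //one refinement step: x *= (3 - d*x*x)/2
++//     return x;
++// }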
++
++float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
++_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
++{
++ float32x2_t res;
++ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
++ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
++ return res;
++}
++
++float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
++_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
++{
++ __m128 f3, f05, mul;
++ f3 = _mm_set1_ps(3.);
++ f05 = _mm_set1_ps(0.5);
++ mul = _mm_mul_ps(a,b);
++ f3 = _mm_sub_ps(f3,mul);
++ return _mm_mul_ps (f3, f05);
++}
++//********************************************************************************************
++//***************************** Shifts by signed variable ***********************************
++//********************************************************************************************
++//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
++//********************************************************************************************
++//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
++//helper macro. It matches ARM implementation for big shifts
++#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
++ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
++ for (i = 0; i<LEN; i++) { \
++ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
++ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ?
a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \ ++ return res; ++ ++int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(8, i, 8) ++} ++ ++int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(16, i, 4) ++} ++ ++int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(32, i, 2) ++} ++ ++int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(64, i, 1) ++} ++ ++uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(8, u, 8) ++} ++ ++uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(16, u, 4) ++} ++ ++uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(32, u, 2) ++} ++ ++uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers ++{ ++ SERIAL_SHIFT_64(64, u, 1) ++} ++ ++int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int8_t, int8_t, 16, 16) ++} ++ ++int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int16_t, int16_t, 8, 8) ++} ++ ++int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int32_t, int32_t, 4, 4) ++} ++ ++int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int64_t, int64_t, 2, 2) ++} ++ ++uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(uint8_t, int8_t, 16, 16) ++} ++ ++uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(uint16_t, int16_t, 8, 8) ++} ++ ++uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t 
b); // VSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
++}
++
++uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
++}
++
++
++//*********** Vector saturating shift left: (negative values shift right) **********************
++//********************************************************************************************
++//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
++#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
++ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
++ else{ \
++ if (btmp[i]>lanesize_1) { \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
++ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ else res[i] = atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
++ TYPE lanesize = (sizeof(TYPE) << 3); \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
++ else{ \
++ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
++ else{ \
++ limit = (TYPE) 1 << (lanesize - btmp[i]); \
++ res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
++ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize_1) { \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
++ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
++ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
++ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
++ else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
++ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ?
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \ ++ return res; ++ ++int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8) ++} ++ ++int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4) ++} ++ ++int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2) ++} ++ ++int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1) ++} ++ ++uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8) ++} ++ ++uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4) ++} ++ ++uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2) ++} ++ ++uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1) ++} ++ ++int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) ++} ++ ++int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) ++} ++ ++int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) ++} ++ ++int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) ++} ++ ++uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) ++} ++ ++uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
++}
++
++uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
++}
++
++uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
++}
++
++
++//******** Vector rounding shift left: (negative values shift right) **********
++//****************************************************************************
++//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
++//rounding makes sense for right shifts only.
++#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if(btmp[i] >= 0) { \
++ if(btmp[i] >= lanesize) res[i] = 0; \
++ else res[i] = (atmp[i] << btmp[i]); \
++ }else{ \
++ res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
++ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
++ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
++ return _mm_load_si128((__m128i*)res);
++
++
++#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
++ for (i = 0; i<LEN; i++) { \
++ if(b.m64_i ## TYPE[i] >= 0) { \
++ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
++ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
++ }else{ \
++ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
++ (b.m64_i ## TYPE[i] == -lanesize) ?
(a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \ ++ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \ ++ return res; ++ ++ ++int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(8,i,8) ++} ++ ++int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(16,i,4) ++} ++ ++int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(32,i,2) ++} ++ ++int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(64,i,1) ++} ++ ++uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(8,u,8) ++} ++ ++uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(16,u,4) ++} ++ ++uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(32,u,2) ++} ++ ++uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(64,u,1) ++} ++ ++int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) ++} ++ ++int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) ++} ++ ++int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) ++} ++ ++int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) ++} ++ ++uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 
16, 16)
++}
++
++uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
++}
++
++uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
++}
++
++uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
++}
++
++
++//********** Vector saturating rounding shift left: (negative values shift right) ****************
++//*************************************************************************************************
++//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
++//Saturation happens for left shifts only while rounding makes sense for right shifts only.
++#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
++ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
++ else{ \
++ if (btmp[i]>lanesize_1) { \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
++ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ else res[i] = atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
++ int lanesize = (sizeof(TYPE) << 3); \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
++ else{ \
++ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
++ else{ \
++ limit = (TYPE) 1 << (lanesize - btmp[i]); \
++ res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
++ __m64_128 res; int ## TYPE ## _t limit; int i; \
++ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize_1) { \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
++ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
++ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
++ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
++ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
++ else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
++ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
++}
++
++int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
++}
++
++int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
++}
++
++int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
++}
++
++uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
++}
++
++uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
++}
++
++uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2) ++} ++ ++uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1) ++} ++ ++int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) ++} ++ ++int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) ++} ++ ++int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) ++} ++ ++int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) ++} ++ ++uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) ++} ++ ++uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) ++} ++ ++uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) ++} ++ ++uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) ++} ++ ++// ********************************************************************************* ++// ***************************** Shifts by a constant ***************************** ++// ********************************************************************************* ++//**************** Vector shift right by constant************************************* ++//************************************************************************************ ++int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_srai_epi16 (r, b); //SSE2 ++ r = _mm_packs_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b) ++{ ++ int16x4_t 
res64; ++ return64(_mm_srai_epi16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(_mm_srai_epi32(_pM128i(a), b)); ++} ++ ++int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no arithmetic shift for 64bit values, serial solution used ++ int64x1_t res; ++ if(b>=64) res.m64_i64[0] = 0; ++ else res.m64_i64[0] = (*(int64_t*)&a) >> b; ++ return res; ++} ++ ++uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ uint8x8_t res64; ++ __m128i r; ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one ++ r = _mm_packus_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b) ++{ ++ uint16x4_t res64; ++ return64(_mm_srli_epi16(_pM128i(a), b)); ++} ++ ++ ++uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ return64(_mm_srli_epi32(_pM128i(a), b)); ++} ++ ++ ++uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b) ++{ ++ uint64x1_t res64; ++ return64(_mm_srli_epi64(_pM128i(a), b)); ++} ++ ++ ++int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i zero, mask0, a_sign, r, a_sign_mask; ++ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; ++ zero = _mm_setzero_si128(); ++ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 ++ r = _mm_srai_epi16 (a, b); ++ a_sign_mask = _mm_and_si128 (mask0, a_sign); ++ r = _mm_andnot_si128 (mask0, r); ++ return _mm_or_si128 (r, a_sign_mask); ++} ++ ++int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 ++#define vshrq_n_s16 _mm_srai_epi16 ++ ++int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 ++#define vshrq_n_s32 _mm_srai_epi32 ++ ++int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) ++{ ++ //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD ++ __m128i c1, signmask,a0, res64; ++ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; ++ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff ++ signmask = _mm_slli_epi64 (c1, (64 - b)); ++ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit ++ a0 = _MM_CMPEQ_EPI64 (a, a0); ++ signmask = _mm_and_si128(a0, 
signmask); ++ res64 = _mm_srli_epi64 (a, b); ++ return _mm_or_si128(res64, signmask); ++} ++ ++uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 ++{ ++ //no 8 bit shift available, need the special trick ++ __m128i mask0, r; ++ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; ++ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ r = _mm_srli_epi16 ( a, b); ++ return _mm_and_si128 (r, mask0); ++} ++ ++uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 ++#define vshrq_n_u16 _mm_srli_epi16 ++ ++uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 ++#define vshrq_n_u32 _mm_srli_epi32 ++ ++uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 ++#define vshrq_n_u64 _mm_srli_epi64 ++ ++//*************************** Vector shift left by constant ************************* ++//********************************************************************************* ++int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_slli_epi16 (r, b); //SSE2 ++ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b) ++{ ++ int16x4_t res64; ++ return64(_mm_slli_epi16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ int32x2_t res64; ++ return64(_mm_slli_epi32(_pM128i(a), b)); ++} ++ ++ ++int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b) ++{ ++ int64x1_t res64; ++ return64(_mm_slli_epi64(_pM128i(a), b)); ++} ++ ++ ++uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b) ++{ ++ //no 8 bit shift available, go to 16 bit ++ uint8x8_t res64; ++ __m128i mask8; ++ __m128i r; ++ mask8 = _mm_set1_epi16(0xff); ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_slli_epi16 (r, b); //SSE2 ++ r = _mm_and_si128(r, mask8); //to avoid saturation ++ r = _mm_packus_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++#define vshl_n_u16 vshl_n_s16 ++ ++ ++uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++#define vshl_n_u32 vshl_n_s32 ++ ++uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++#define vshl_n_u64 vshl_n_s64 ++ ++int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++#define vshlq_n_s8 vshlq_n_u8 ++ ++int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); 
// VSHL.I16 q0,q0,#0 ++#define vshlq_n_s16 _mm_slli_epi16 ++ ++int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++#define vshlq_n_s32 _mm_slli_epi32 ++ ++int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++#define vshlq_n_s64 _mm_slli_epi64 ++ ++uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) ++{ ++ //no 8 bit shift available, need the special trick ++ __m128i mask0, r; ++ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; ++ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ r = _mm_slli_epi16 ( a, b); ++ return _mm_and_si128 (r, mask0); ++} ++ ++uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++#define vshlq_n_u16 vshlq_n_s16 ++ ++uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++#define vshlq_n_u32 vshlq_n_s32 ++ ++uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++#define vshlq_n_u64 vshlq_n_s64 ++ ++//************* Vector rounding shift right by constant ****************** ++//************************************************************************* ++//No corresponding x86 intrinsics exist, need to do some tricks ++int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r, maskb; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 ++ r = _mm_srai_epi16 (r, b); ++ r = _mm_add_epi16 (r, maskb); //actual rounding ++ r = _mm_packs_epi16 (r,r); ////we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b) ++{ ++ int16x4_t res64; ++ return64(vrshrq_n_s16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(vrshrq_n_s32(_pM128i(a), b)); ++} ++ ++ ++int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution is faster ++ int64x1_t res; ++ int64_t a_i64 = *( int64_t*)&a; ++ if(b==64) { ++ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63; ++ } else { ++ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1)); ++ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1)); ++ } ++ return res; ++} ++ ++uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one ++ uint8x8_t res64; ++ __m128i r, maskb; ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit ++ maskb = 
_mm_srli_epi16 (maskb, 15); //1 or 0 ++ r = _mm_srli_epi16 (r, b); ++ r = _mm_add_epi16 (r, maskb); //actual rounding ++ r = _mm_packus_epi16 (r,r); ////we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b) ++{ ++ uint16x4_t res64; ++ return64(vrshrq_n_u16(_pM128i(a), b)); ++} ++ ++ ++uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ return64(vrshrq_n_u32(_pM128i(a), b)); ++} ++ ++ ++uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b) ++{ ++ uint64x1_t res64; ++ return64(vrshrq_n_u64(_pM128i(a), b)); ++} ++ ++int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i r, mask1, maskb; ++ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 ++ r = vshrq_n_s8 (a, b); ++ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding ++ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding ++ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 ++ return _mm_add_epi8(r, maskb); //actual rounding ++} ++ ++int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 ++ r = _mm_srai_epi16 (a, b); ++ return _mm_add_epi16 (r, maskb); //actual rounding ++} ++ ++int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 ++ r = _mm_srai_epi32(a, b); ++ return _mm_add_epi32 (r, maskb); //actual rounding ++} ++ ++int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) ++{ ++ //solution may be not optimal compared with a serial one ++ __m128i maskb; ++ int64x2_t r; ++ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 ++ r = vshrq_n_s64(a, b); ++ return _mm_add_epi64 (r, maskb); //actual rounding ++} ++ ++uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i r, mask1, maskb; ++ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 ++ r = vshrq_n_u8 (a, b); ++ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding ++ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding ++ 
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 ++ return _mm_add_epi8(r, maskb); //actual rounding ++} ++ ++uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 ++ r = _mm_srli_epi16 (a, b); ++ return _mm_add_epi16 (r, maskb); //actual rounding ++} ++ ++uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 ++ r = _mm_srli_epi32(a, b); ++ return _mm_add_epi32 (r, maskb); //actual rounding ++} ++ ++uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) ++{ ++ //solution may be not optimal compared with a serial one ++ __m128i maskb, r; ++ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 ++ r = _mm_srli_epi64(a, b); ++ return _mm_add_epi64 (r, maskb); //actual rounding ++} ++ ++//************* Vector shift right by constant and accumulate ********* ++//********************************************************************* ++int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8 ++{ ++ int8x8_t shift; ++ shift = vshr_n_s8(b, c); ++ return vadd_s8( a, shift); ++} ++ ++int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16 ++{ ++ int16x4_t shift; ++ shift = vshr_n_s16( b, c); ++ return vadd_s16(a, shift); ++} ++ ++int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ int32x2_t shift; ++ shift = vshr_n_s32(b, c); ++ return vadd_s32( a, shift); ++} ++ ++int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 ++_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) ++{ ++ //may be not optimal compared with a serial solution ++ int64x1_t shift; ++ shift = vshr_n_s64(b, c); ++ return vadd_s64( a, shift); ++} ++ ++uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8 ++{ ++ uint8x8_t shift; ++ shift = vshr_n_u8(b, c); ++ return vadd_u8(a, shift); ++} ++ ++uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16 ++{ ++ uint16x4_t shift; ++ shift = vshr_n_u16(b, c); ++ return vadd_u16(a,shift); ++} ++ ++uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // 
VSRA.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ uint32x2_t shift; ++ shift = vshr_n_u32(b, c); ++ return vadd_u32( a, shift); ++} ++ ++uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64 ++{ ++ //may be not optimal compared with the serial execution ++ uint64x1_t shift; ++ shift = vshr_n_u64(b, c); ++ return vadd_u64(a, shift); ++} ++ ++int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 ++{ ++ int8x16_t shift; ++ shift = vshrq_n_s8(b, c); ++ return vaddq_s8(a, shift); ++} ++ ++int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 ++{ ++ int16x8_t shift; ++ shift = vshrq_n_s16(b, c); ++ return vaddq_s16(a, shift); ++} ++ ++int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 ++{ ++ int32x4_t shift; ++ shift = vshrq_n_s32(b, c); ++ return vaddq_s32(a, shift); ++} ++ ++int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 ++{ ++ int64x2_t shift; ++ shift = vshrq_n_s64(b, c); ++ return vaddq_s64( a, shift); ++} ++ ++uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 ++{ ++ uint8x16_t shift; ++ shift = vshrq_n_u8(b, c); ++ return vaddq_u8(a, shift); ++} ++ ++uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16 ++{ ++ uint16x8_t shift; ++ shift = vshrq_n_u16(b, c); ++ return vaddq_u16(a, shift); ++} ++ ++uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 ++{ ++ uint32x4_t shift; ++ shift = vshrq_n_u32(b, c); ++ return vaddq_u32(a, shift); ++} ++ ++uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 ++{ ++ uint64x2_t shift; ++ shift = vshrq_n_u64(b, c); ++ return vaddq_u64(a, shift); ++} ++ ++//************* Vector rounding shift right by constant and accumulate **************************** ++//************************************************************************************************ ++int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8 ++{ ++ int8x8_t shift; ++ shift = 
vrshr_n_s8(b, c); ++ return vadd_s8( a, shift); ++} ++ ++int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16 ++{ ++ int16x4_t shift; ++ shift = vrshr_n_s16( b, c); ++ return vadd_s16(a, shift); ++} ++ ++int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ int32x2_t shift; ++ shift = vrshr_n_s32(b, c); ++ return vadd_s32( a, shift); ++} ++ ++int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution ++{ ++ int64x1_t shift; ++ shift = vrshr_n_s64(b, c); ++ return vadd_s64( a, shift); ++} ++ ++uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8 ++{ ++ uint8x8_t shift; ++ shift = vrshr_n_u8(b, c); ++ return vadd_u8(a, shift); ++} ++ ++uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16 ++{ ++ uint16x4_t shift; ++ shift = vrshr_n_u16(b, c); ++ return vadd_u16(a,shift); ++} ++ ++uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ uint32x2_t shift; ++ shift = vrshr_n_u32(b, c); ++ return vadd_u32( a, shift); ++} ++ ++uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution ++{ ++ //may be not optimal compared with the serial execution ++ uint64x1_t shift; ++ shift = vrshr_n_u64(b, c); ++ return vadd_u64( a, shift); ++} ++ ++int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 ++{ ++ int8x16_t shift; ++ shift = vrshrq_n_s8(b, c); ++ return vaddq_s8(a, shift); ++} ++ ++int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 ++{ ++ int16x8_t shift; ++ shift = vrshrq_n_s16(b, c); ++ return vaddq_s16(a, shift); ++} ++ ++int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 ++{ ++ int32x4_t shift; ++ shift = vrshrq_n_s32(b, c); ++ return vaddq_s32(a, shift); ++} ++ ++int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 ++_NEON2SSE_INLINE 
int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) ++{ ++ int64x2_t shift; ++ shift = vrshrq_n_s64(b, c); ++ return vaddq_s64(a, shift); ++} ++ ++uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 ++{ ++ uint8x16_t shift; ++ shift = vrshrq_n_u8(b, c); ++ return vaddq_u8(a, shift); ++} ++ ++uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 ++{ ++ uint16x8_t shift; ++ shift = vrshrq_n_u16(b, c); ++ return vaddq_u16(a, shift); ++} ++ ++uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 ++{ ++ uint32x4_t shift; ++ shift = vrshrq_n_u32(b, c); ++ return vaddq_u32(a, shift); ++} ++ ++uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) ++{ ++ uint64x2_t shift; ++ shift = vrshrq_n_u64(b, c); ++ return vaddq_u64(a, shift); ++} ++ ++//**********************Vector saturating shift left by constant ***************************** ++//******************************************************************************************** ++//we don't check const ranges assuming they are met ++int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) ++ int8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); ++ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only ++ return64(r128); ++} ++ ++int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0 ++{ ++ // go to 32 bit to get the auto saturation (in packs function) ++ int16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only ++ return64(r128); ++} ++ ++int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ //serial execution may be faster ++ int32x2_t res64; ++ return64(vqshlq_n_s32 (_pM128i(a), b)); ++} ++ ++ ++int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ int64x1_t res; ++ int64_t bmask; ++ int64_t a_i64 = *( int64_t*)&a; ++ bmask = ( int64_t)1 << (63 - b); //positive ++ if (a_i64 >= bmask) { ++ res.m64_i64[0] = ~(_SIGNBIT64); ++ } else { ++ res.m64_i64[0] = (a_i64 <= -bmask) ? 
_SIGNBIT64 : a_i64 << b; ++ } ++ return res; ++} ++ ++ ++uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit ++ uint8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); //shift_res ++ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only ++ return64(r128); ++} ++ ++uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0 ++_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0 ++{ ++ // go to 32 bit to get the auto saturation (in packus function) ++ uint16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16 ++ return64(r128); ++} ++ ++uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 ++_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b) ++{ ++ uint32x2_t res64; ++ return64(vqshlq_n_u32(_pM128i(a), b)); ++} ++ ++uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ uint64x1_t res; ++ uint64_t bmask; ++ uint64_t a_i64 = *(uint64_t*)&a; ++ bmask = ( uint64_t)1 << (64 - b); ++ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a ++ return res; ++} ++ ++int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 ++_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 ++{ ++ // go to 16 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 ++} ++ ++int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 ++_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 ++{ ++ // manual saturation solution looks LESS optimal than 32 bits conversion one ++ // go to 32 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi32 (a128, b); //shift_res ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI16_EPI32 (a128); ++ r128_2 = _mm_slli_epi32 (a128, b); ++ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 ++} ++ ++int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 ++_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 ++{ ++ // no 64 bit saturation option available, special tricks necessary ++ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; ++ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff ++ maskA = _mm_srli_epi32(c1, b + 
1); //mask for positive numbers (32-b+1) zeros and b-1 ones ++ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise ++ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not ++ shift_res = _mm_slli_epi32 (a, b); ++ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); ++ //result with positive numbers saturated ++ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); ++ //treat negative numbers ++ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros ++ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise ++ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not ++ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); ++ return _mm_or_si128 (c7ffffff_mask, shift_res_mask); ++} ++ ++int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; ++ int64_t bmask; ++ int i; ++ bmask = ( int64_t)1 << (63 - b); //positive ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ if (atmp[i] >= bmask) { ++ res[i] = ~(_SIGNBIT64); ++ } else { ++ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b; ++ } ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 ++{ ++ // go to 16 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPU8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 ++} ++ ++uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 ++_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 ++{ ++ // manual saturation solution looks more optimal than 32 bits conversion one ++ __m128i cb, c8000, a_signed, saturation_mask, shift_res; ++ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); ++ c8000 = _mm_set1_epi16 (0x8000); ++//no unsigned shorts comparison in SSE, only signed available, so need the trick ++ a_signed = _mm_sub_epi16(a, c8000); //go to signed ++ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); ++ shift_res = _mm_slli_epi16 (a, b); ++ return _mm_or_si128 (shift_res, saturation_mask); ++} ++ ++uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 ++_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 ++{ ++ // manual saturation solution, no 64 bit saturation option, the serial version may be faster ++ __m128i cb, c80000000, a_signed, saturation_mask, shift_res; ++ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); ++ c80000000 = _mm_set1_epi32 (0x80000000); ++//no unsigned ints comparison in SSE, only signed available, so need the trick ++ a_signed = _mm_sub_epi32(a, c80000000); //go to signed ++ saturation_mask = _mm_cmpgt_epi32 
(a_signed, cb); ++ shift_res = _mm_slli_epi32 (a, b); ++ return _mm_or_si128 (shift_res, saturation_mask); ++} ++ ++uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; ++ uint64_t bmask; ++ int i; ++ bmask = ( uint64_t)1 << (64 - b); ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//**************Vector signed->unsigned saturating shift left by constant ************* ++//************************************************************************************* ++uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) ++ uint8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); ++ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only ++ return64(r128); ++} ++ ++uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 ++_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0 ++{ ++ uint16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only ++ return64(r128); ++} ++ ++uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ int32x2_t res64; ++ return64( vqshluq_n_s32(_pM128i(a), b)); ++} ++ ++uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster ++{ ++ uint64x1_t res; ++ uint64_t limit; ++ if (a.m64_i64[0]<=0) { ++ res.m64_u64[0] = 0; ++ } else { ++ limit = (uint64_t) 1 << (64 - b); ++ res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? 
res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b; ++ } ++ return res; ++} ++ ++uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 ++{ ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 ++} ++ ++uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 ++_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 ++{ ++ // manual saturation solution looks LESS optimal than 32 bits conversion one ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi32 (a128, b); //shift_res ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI16_EPI32 (a128); ++ r128_2 = _mm_slli_epi32 (a128, b); ++ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 ++} ++ ++uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 ++_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; ++ zero = _mm_setzero_si128(); ++ maskA = _mm_cmpeq_epi32(a, a); ++ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros ++ //saturate negative numbers to zero ++ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) ++ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now ++ //saturate positive to 0xffffffff ++ a_masked = _mm_and_si128 (a0, maskA); ++ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise ++ a_shift = _mm_slli_epi32 (a0, b); ++ return _mm_or_si128 (a_shift, a_masked); //actual saturation ++} ++ ++uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here, serial execution looks faster ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ uint64_t limit; ++ int i; ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ if (atmp[i]<=0) { ++ res[i] = 0; ++ } else { ++ limit = (uint64_t) 1 << (64 - b); ++ res[i] = ( ((uint64_t)atmp[i]) >= limit) ? 
res[i] = ~((uint64_t)0) : atmp[i] << b; ++ } ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//************** Vector narrowing shift right by constant ************** ++//********************************************************************** ++int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r16 = vshrq_n_s16(a,b); ++ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r16); ++} ++ ++int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r32 = vshrq_n_s32(a,b); ++ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r32); ++} ++ ++int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ __m128i r64; ++ r64 = vshrq_n_s64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i mask, r16; ++ mask = _mm_set1_epi16(0xff); ++ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_and_si128(r16, mask); //to avoid saturation ++ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i mask, r32; ++ mask = _mm_set1_epi32(0xffff); ++ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16) ++ r32 = _mm_and_si128(r32, mask); //to avoid saturation ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ __m128i r64; ++ r64 = vshrq_n_u64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++//************** Vector signed->unsigned narrowing saturating shift right by constant ******** ++//********************************************************************************************* ++uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t 
vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i r16; ++ r16 = vshrq_n_s16(a,b); ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i r32; ++ r32 = vshrq_n_s32(a,b); ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster ++{ ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ uint32x2_t res; ++ int64_t res64; ++ _mm_store_si128((__m128i*)atmp, a); ++ if (atmp[0] < 0) { ++ res.m64_u32[0] = 0; ++ } else { ++ res64 = (atmp[0] >> b); ++ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64; ++ } ++ if (atmp[1] < 0) { ++ res.m64_u32[1] = 0; ++ } else { ++ res64 = (atmp[1] >> b); ++ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64; ++ } ++ return res; ++} ++ ++//**** Vector signed->unsigned rounding narrowing saturating shift right by constant ***** ++uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i r16; ++ uint8x8_t res64; ++ r16 = vrshrq_n_s16(a,b); ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i r32; ++ uint16x4_t res64; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster ++{ ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ uint32x2_t res; ++ int64_t res64; ++ _mm_store_si128((__m128i*)atmp, a); ++ if (atmp[0] < 0) { ++ res.m64_u32[0] = 0; ++ } else { ++ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); ++ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64; ++ } ++ if (atmp[1] < 0) { ++ res.m64_u32[1] = 0; ++ } else { ++ res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); ++ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 
0xffffffff : res64;
++    }
++    return res;
++}
++
++//***** Vector narrowing saturating shift right by constant ******
++//*****************************************************************
++int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
++_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
++{
++    int8x8_t res64;
++    __m128i r16;
++    r16 = vshrq_n_s16(a,b);
++    r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
++    return64(r16);
++}
++
++int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
++_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
++{
++    int16x4_t res64;
++    __m128i r32;
++    r32 = vshrq_n_s32(a,b);
++    r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
++    return64(r32);
++}
++
++int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
++{
++    //no optimal SIMD solution found
++    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
++    int32x2_t res;
++    _mm_store_si128((__m128i*)atmp, a);
++    res64[0] = (atmp[0] >> b);
++    res64[1] = (atmp[1] >> b);
++    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
++    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
++    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
++    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
++    res.m64_i32[0] = (int32_t)res64[0];
++    res.m64_i32[1] = (int32_t)res64[1];
++    return res;
++}
++
++uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
++_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
++{
++    uint8x8_t res64;
++    __m128i r16;
++    r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
++    r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
++    return64(r16);
++}
++
++uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
++_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
++{
++    uint16x4_t res64;
++    __m128i r32;
++    r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
++    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
++    return64(r32);
++}
++
++uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
++_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
++{
++    //serial solution may be faster
++    uint32x2_t res64;
++    __m128i r64, res_hi, zero;
++    zero = _mm_setzero_si128();
++    r64 = vshrq_n_u64(a,b);
++    res_hi = _mm_srli_epi64(r64, 32);
++    res_hi = _mm_cmpgt_epi32(res_hi, zero);
++    r64 = _mm_or_si128(r64, res_hi);
++    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
++    return64(r64);
++}
++
++
++//********* Vector rounding narrowing shift right by constant *************************
++//****************************************************************************************
++int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
++_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
++{
++    int8x8_t res64;
++    __m128i r16;
++    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
++    r16 = vrshrq_n_s16(a,b);
++    r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only.
Impossible to use _mm_packs because of negative saturation problems ++ return64(r16); ++} ++ ++int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r32); ++} ++ ++int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ __m128i r64; ++ r64 = vrshrq_n_s64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i mask, r16; ++ mask = _mm_set1_epi16(0xff); ++ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_and_si128(r16, mask); //to avoid saturation ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i mask, r32; ++ mask = _mm_set1_epi32(0xffff); ++ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) ++ r32 = _mm_and_si128(r32, mask); //to avoid saturation ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster ++{ ++ uint32x2_t res64; ++ __m128i r64; ++ r64 = vrshrq_n_u64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++//************* Vector rounding narrowing saturating shift right by constant ************ ++//**************************************************************************************** ++int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ r16 = vrshrq_n_s16(a,b); ++ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++int32x2_t vqrshrn_n_s64(int64x2_t a, 
__constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
++{
++    //no optimal SIMD solution found
++    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
++    int32x2_t res;
++    _mm_store_si128((__m128i*)atmp, a);
++    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
++    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
++    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
++    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
++    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
++    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
++    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
++    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
++    res.m64_i32[0] = (int32_t)res64[0];
++    res.m64_i32[1] = (int32_t)res64[1];
++    return res;
++}
++
++uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
++_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
++{
++    uint8x8_t res64;
++    __m128i r16;
++    r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
++    r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
++    return64(r16);
++}
++
++uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
++_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
++{
++    uint16x4_t res64;
++    __m128i r32;
++    r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
++    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
++    return64(r32);
++}
++
++uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
++_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
++{
++    //serial solution may be faster
++    uint32x2_t res64;
++    __m128i r64, res_hi, zero;
++    zero = _mm_setzero_si128();
++    r64 = vrshrq_n_u64(a,b);
++    res_hi = _mm_srli_epi64(r64, 32);
++    res_hi = _mm_cmpgt_epi32(res_hi, zero);
++    r64 = _mm_or_si128(r64, res_hi);
++    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
++    return64(r64);
++}
++
++//************** Vector widening shift left by constant ****************
++//************************************************************************
++int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
++_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
++{
++    __m128i r;
++    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
++    return _mm_slli_epi16 (r, b);
++}
++
++int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
++_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
++{
++    __m128i r;
++    r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
++    return _mm_slli_epi32 (r, b);
++}
++
++int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
++_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
++{
++    __m128i r;
++    r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
++    return _mm_slli_epi64 (r, b);
++}
++
++uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
++_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
++{
++    //no uint8 to uint16 conversion available, manual conversion used
++    __m128i zero, r;
++    zero = _mm_setzero_si128 ();
++    r = _mm_unpacklo_epi8(_pM128i(a), zero);
++    return _mm_slli_epi16 (r, b);
++}
++
++uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
++_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a,
__constrange(0,16) int b) // VSHLL.s16 q0,d0,#0 ++{ ++ //no uint16 to uint32 conversion available, manual conversion used ++ __m128i zero, r; ++ zero = _mm_setzero_si128 (); ++ r = _mm_unpacklo_epi16(_pM128i(a), zero); ++ return _mm_slli_epi32 (r, b); ++} ++ ++uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 ++_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0 ++{ ++ //no uint32 to uint64 conversion available, manual conversion used ++ __m128i zero, r; ++ zero = _mm_setzero_si128 (); ++ r = _mm_unpacklo_epi32(_pM128i(a), zero); ++ return _mm_slli_epi64 (r, b); ++} ++ ++//************************************************************************************ ++//**************************** Shifts with insert ************************************ ++//************************************************************************************ ++//takes each element in a vector, shifts them by an immediate value, ++//and inserts the results in the destination vector. Bits shifted out of the each element are lost. ++ ++//**************** Vector shift right and insert ************************************ ++//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. ++//All other bits are taken from b shifted. ++int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) ++{ ++ int8x8_t res64; ++ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) ++{ ++ int16x4_t res64; ++ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) ++{ ++ int32x2_t res64; ++ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) ++{ ++ int64x1_t res; ++ if (c ==64) ++ res = a; ++ else{ ++ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros ++ } ++ return res; ++} ++ ++uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++#define vsri_n_u8 vsri_n_s8 ++ ++uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++#define vsri_n_u16 vsri_n_s16 ++ ++uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++#define vsri_n_u32 vsri_n_s32 ++ ++ ++uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++#define vsri_n_u64 vsri_n_s64 ++ ++poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++#define vsri_n_p8 vsri_n_u8 ++ ++poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++#define vsri_n_p16 vsri_n_u16 ++ ++int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 
++{ ++ __m128i maskA, a_masked; ++ uint8x16_t b_shift; ++ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used ++ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros ++ a_masked = _mm_and_si128 (a, maskA); ++ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift ++ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) ++} ++ ++int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 ++{ ++ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a ++ uint16x8_t b_shift; ++ uint16x8_t a_c; ++ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift ++ a_c = vshrq_n_u16( a, (16 - c)); ++ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 ++{ ++ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a ++ uint32x4_t b_shift; ++ uint32x4_t a_c; ++ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift ++ a_c = vshrq_n_u32( a, (32 - c)); ++ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) ++{ ++ //serial solution may be faster ++ uint64x2_t b_shift; ++ uint64x2_t a_c; ++ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift ++ a_c = _mm_srli_epi64(a, (64 - c)); ++ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++#define vsriq_n_u8 vsriq_n_s8 ++ ++uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++#define vsriq_n_u16 vsriq_n_s16 ++ ++uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++#define vsriq_n_u32 vsriq_n_s32 ++ ++uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++#define vsriq_n_u64 vsriq_n_s64 ++ ++poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++#define vsriq_n_p8 vsriq_n_u8 ++ ++poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++#define vsriq_n_p16 vsriq_n_u16 ++ ++//***** Vector shift left and insert ********************************************* ++//********************************************************************************* ++//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. ++//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 
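For readers who want to sanity-check the shift-and-insert emulation above, here is a standalone scalar model of a single 32-bit lane (not part of the patch; the helper names and test values are illustrative only). It computes the same "keep c bits of a, take the remaining bits from the shifted b" combination that the SSE mask tricks perform per lane:

    /* scalar model of one 32-bit lane of VSRI/VSLI; plain C99, no SIMD needed */
    #include <stdint.h>
    #include <assert.h>

    static uint32_t sri32_model(uint32_t a, uint32_t b, int c)   /* VSRI.32, 1 <= c <= 32 */
    {
        uint32_t keep_a = (c == 32) ? a : (a >> (32 - c)) << (32 - c); /* top c bits of a survive */
        uint32_t from_b = (c == 32) ? 0 : (b >> c);                    /* b shifted right logically */
        return keep_a | from_b;
    }

    static uint32_t sli32_model(uint32_t a, uint32_t b, int c)   /* VSLI.32, 0 <= c <= 31 */
    {
        uint32_t keep_a = (c == 0) ? 0 : (a << (32 - c)) >> (32 - c);  /* low c bits of a survive */
        return (b << c) | keep_a;                                      /* b shifted left fills the rest */
    }

    int main(void)
    {
        assert(sri32_model(0xDEADBEEFu, 0x12345678u, 8) == 0xDE123456u);
        assert(sli32_model(0xDEADBEEFu, 0x12345678u, 8) == 0x345678EFu);
        return 0;
    }

The SSE versions in the patch build exactly these two halves per lane, either with a precomputed byte mask (the 8-bit forms) or with the shift-out-and-shift-back trick (the 16/32/64-bit forms), and then OR them together.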
++int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c) ++{ ++ int8x8_t res64; ++ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c) ++{ ++ int16x4_t res64; ++ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c) ++{ ++ int32x2_t res64; ++ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c)); ++} ++ ++int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros ++ return res; ++} ++ ++ ++uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++#define vsli_n_u8 vsli_n_s8 ++ ++uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++#define vsli_n_u16 vsli_n_s16 ++ ++uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++#define vsli_n_u32 vsli_n_s32 ++ ++uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++#define vsli_n_u64 vsli_n_s64 ++ ++poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++#define vsli_n_p8 vsli_n_u8 ++ ++poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++#define vsli_n_p16 vsli_n_u16 ++ ++int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 ++{ ++ __m128i maskA, a_masked; ++ int8x16_t b_shift; ++ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask ++ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones ++ b_shift = vshlq_n_s8( b, c); ++ a_masked = _mm_and_si128 (a, maskA); ++ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) ++} ++ ++int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 ++{ ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a ++ int16x8_t b_shift; ++ int16x8_t a_c; ++ b_shift = vshlq_n_s16( b, c); ++ a_c = vshlq_n_s16( a, (16 - c)); ++ a_c = _mm_srli_epi16(a_c, (16 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a ++ int32x4_t b_shift; ++ int32x4_t a_c; ++ b_shift = vshlq_n_s32( b, c); 
++ a_c = vshlq_n_s32( a, (32 - c)); ++ a_c = _mm_srli_epi32(a_c, (32 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a ++ int64x2_t b_shift; ++ int64x2_t a_c; ++ b_shift = vshlq_n_s64( b, c); ++ a_c = vshlq_n_s64( a, (64 - c)); ++ a_c = _mm_srli_epi64(a_c, (64 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++#define vsliq_n_u8 vsliq_n_s8 ++ ++uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++#define vsliq_n_u16 vsliq_n_s16 ++ ++uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++#define vsliq_n_u32 vsliq_n_s32 ++ ++uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++#define vsliq_n_u64 vsliq_n_s64 ++ ++poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++#define vsliq_n_p8 vsliq_n_u8 ++ ++poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++#define vsliq_n_p16 vsliq_n_u16 ++ ++// *********************************************************************************************** ++// ****************** Loads and stores of a single vector *************************************** ++// *********************************************************************************************** ++//Performs loads and stores of a single vector of some type. ++//******************************* Loads ******************************************************** ++// *********************************************************************************************** ++//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. ++//also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. ++// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access ++//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; ++#define LOAD_SI128(ptr) \ ++ ( ((unsigned long)(ptr) & 15) == 0 ) ? 
_mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)); ++ ++uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_u8 LOAD_SI128 ++ ++uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_u16 LOAD_SI128 ++ ++uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++#define vld1q_u32 LOAD_SI128 ++ ++uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld1q_u64 LOAD_SI128 ++ ++int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_s8 LOAD_SI128 ++ ++int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_s16 LOAD_SI128 ++ ++int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++#define vld1q_s32 LOAD_SI128 ++ ++int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld1q_s64 LOAD_SI128 ++ ++float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers ++/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] ++{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); ++__m128 f2; ++f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); ++}*/ ++ ++float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) ++{ ++ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned ++ return _mm_load_ps(ptr); ++ else ++ return _mm_loadu_ps(ptr); ++} ++ ++poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_p8 LOAD_SI128 ++ ++poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_p16 LOAD_SI128 ++ ++uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr)) ++ ++uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_u16 vld1_u8 ++ ++uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] ++#define vld1_u32 vld1_u8 ++ ++ ++uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_u64 vld1_u8 ++ ++int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_s8 vld1_u8 ++ ++int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_s16 vld1_u16 ++ ++int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] ++#define vld1_s32 vld1_u32 ++ ++int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_s64 vld1_u64 ++ ++float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); ++ ++float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] ++_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = *(ptr); ++ res.m64_f32[1] = *(ptr + 1); ++ return res; ++} ++ ++poly8x8_t vld1_p8(__transfersize(8) 
poly8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_p8 vld1_u8 ++ ++poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_p16 vld1_u16 ++ ++//*********************************************************************************************************** ++//******* Lane load functions - insert the data at vector's given position (lane) ************************* ++//*********************************************************************************************************** ++uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) ++ ++uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; ++ ++ ++int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) ++ ++float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) ++{ ++ //we need to deal with ptr 16bit NOT aligned case ++ __m128 p; ++ p = _mm_set1_ps(*(ptr)); ++ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); ++} ++ ++int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) ++ ++poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int 
lane) ++{ ++ uint8x8_t res; ++ res = vec; ++ res.m64_u8[lane] = *(ptr); ++ return res; ++} ++ ++uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16x4_t res; ++ res = vec; ++ res.m64_u16[lane] = *(ptr); ++ return res; ++} ++ ++uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32x2_t res; ++ res = vec; ++ res.m64_u32[lane] = *(ptr); ++ return res; ++} ++ ++uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = *(ptr); ++ return res; ++} ++ ++ ++int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane) ++ ++int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane) ++ ++int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane) ++ ++float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32x2_t res; ++ res = vec; ++ res.m64_f32[lane] = *(ptr); ++ return res; ++} ++ ++int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] ++#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane) ++ ++poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1_lane_p8 vld1_lane_u8 ++ ++poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1_lane_p16 vld1_lane_s16 ++ ++// ****************** Load single value ( set all lanes of vector with same value from memory)********************** ++// ****************************************************************************************************************** ++uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr)) ++ ++uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr)) ++ ++uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_u32(ptr) 
_mm_set1_epi32(*(ptr)) ++ ++uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)}; ++ return LOAD_SI128(val); ++} ++ ++int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr)) ++ ++int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr)) ++ ++int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr)) ++ ++int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr) ++ ++float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++//current IA SIMD doesn't support float16, need to go to 32 bits ++ ++float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr)) ++ ++poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr)) ++ ++poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr)) ++ ++uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint8x8_t res; ++ int i; ++ for(i = 0; i<8; i++) { ++ res.m64_u8[i] = *(ptr); ++ } ++ return res; ++} ++ ++uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint16x4_t res; ++ int i; ++ for(i = 0; i<4; i++) { ++ res.m64_u16[i] = *(ptr); ++ } ++ return res; ++} ++ ++uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = *(ptr); ++ res.m64_u32[1] = *(ptr); ++ return res; ++} ++ ++uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = *(ptr); ++ return res; ++} ++ ++int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr) ++ ++ ++int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr) ++ ++ ++int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr) ++ ++ ++int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr) ++ ++float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // 
VLD1.32 {d0[]}, [r0]
++_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
++{
++ float32x2_t res;
++ res.m64_f32[0] = *(ptr);
++ res.m64_f32[1] = res.m64_f32[0];
++ return res; // use last 64bits only
++}
++
++poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
++#define vld1_dup_p8 vld1_dup_u8
++
++
++poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
++#define vld1_dup_p16 vld1_dup_u16
++
++
++//*************************************************************************************
++//********************************* Store **********************************************
++//*************************************************************************************
++// If ptr is 16-byte aligned and you need to store data without cache pollution then use _mm_stream_si128 ((__m128i*)ptr, val);
++//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
++#define STORE_SI128(ptr, val) \
++ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
++
++void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
++#define vst1q_u8 STORE_SI128
++
++void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
++#define vst1q_u16 STORE_SI128
++
++void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
++#define vst1q_u32 STORE_SI128
++
++void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
++#define vst1q_u64 STORE_SI128
++
++void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
++#define vst1q_s8 STORE_SI128
++
++void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
++#define vst1q_s16 STORE_SI128
++
++void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
++#define vst1q_s32 STORE_SI128
++
++void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
++#define vst1q_s64 STORE_SI128
++
++void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
++// IA32 SIMD doesn't work with 16bit floats currently
++
++void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
++_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
++{
++ if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
++ _mm_store_ps (ptr, val);
++ else
++ _mm_storeu_ps (ptr, val);
++}
++
++void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
++#define vst1q_p8 vst1q_u8
++
++void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
++#define vst1q_p16 vst1q_u16
++
++void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
++_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
++{
++ int i;
++ for (i = 0; i<8; i++) {
++ *(ptr + i) = ((uint8_t*)&val)[i];
++ }
++ //_mm_storel_epi64((__m128i*)ptr, val);
++ return;
++}
++
++void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
++_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
++{
++ int i;
++ for (i = 0; i<4; i++) {
++ *(ptr + i) = ((uint16_t*)&val)[i];
++ }
++ //_mm_storel_epi64((__m128i*)ptr,
val); ++ return; ++} ++ ++void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val) ++{ ++ int i; ++ for (i = 0; i<2; i++) { ++ *(ptr + i) = ((uint32_t*)&val)[i]; ++ } ++ //_mm_storel_epi64((__m128i*)ptr, val); ++ return; ++} ++ ++void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val) ++{ ++ *(ptr) = *((uint64_t*)&val); ++ //_mm_storel_epi64((__m128i*)ptr, val); ++ return; ++} ++ ++void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] ++#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val) ++ ++void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] ++#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val) ++ ++void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] ++#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val) ++ ++void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] ++#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val) ++ ++void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val) ++{ ++ *(ptr) = val.m64_f32[0]; ++ *(ptr + 1) = val.m64_f32[1]; ++ return; ++} ++ ++void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] ++#define vst1_p8 vst1_u8 ++ ++void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] ++#define vst1_p16 vst1_u16 ++ ++//***********Store a lane of a vector into memory (extract given lane) ********************* ++//****************************************************************************************** ++void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) ++ ++void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) ++ ++void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) ++ ++void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] ++#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) ++ ++void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) ++ ++void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) ++ ++void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) ++ ++void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); 
// VST1.64 {d0}, [r0] ++#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) ++ ++void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) ++{ ++ int32_t ilane; ++ ilane = _MM_EXTRACT_PS(val,lane); ++ *(ptr) = *((float*)&ilane); ++} ++ ++void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_p8 vst1q_lane_u8 ++ ++void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_p16 vst1q_lane_s16 ++ ++void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val.m64_u8[lane]; ++} ++ ++void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val.m64_u16[lane]; ++} ++ ++void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val.m64_u32[lane]; ++} ++ ++void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane) ++{ ++ *(ptr) = val.m64_u64[0]; ++} ++ ++void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane) ++ ++void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane) ++ ++void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane) ++ ++ ++void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] ++#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane) ++ ++ ++void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val.m64_f32[lane]; ++} ++ ++void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1_lane_p8 vst1_lane_u8 ++ ++void vst1_lane_p16(__transfersize(1) 
poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1_lane_p16 vst1_lane_s16 ++ ++//*********************************************************************************************** ++//**************** Loads and stores of an N-element structure ********************************** ++//*********************************************************************************************** ++//These intrinsics load or store an n-element structure. The array structures are defined in the beginning ++//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" ++//****************** 2 elements load ********************************************* ++uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] ++{ ++ uint8x16x2_t v; ++ v.val[0] = vld1q_u8(ptr); ++ v.val[1] = vld1q_u8((ptr + 16)); ++ v = vuzpq_s8(v.val[0], v.val[1]); ++ return v; ++} ++ ++uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] ++{ ++ uint16x8x2_t v; ++ v.val[0] = vld1q_u16( ptr); ++ v.val[1] = vld1q_u16( (ptr + 8)); ++ v = vuzpq_s16(v.val[0], v.val[1]); ++ return v; ++} ++ ++uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] ++{ ++ uint32x4x2_t v; ++ v.val[0] = vld1q_u32 ( ptr); ++ v.val[1] = vld1q_u32 ( (ptr + 4)); ++ v = vuzpq_s32(v.val[0], v.val[1]); ++ return v; ++} ++ ++int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); ++#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) ++ ++int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) ++ ++int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) ++ ++ ++float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example ++ ++float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] ++{ ++ float32x4x2_t v; ++ v.val[0] = vld1q_f32 (ptr); ++ v.val[1] = vld1q_f32 ((ptr + 4)); ++ v = vuzpq_f32(v.val[0], v.val[1]); ++ return v; ++} ++ ++poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++#define vld2q_p8 vld2q_u8 ++ ++poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++#define vld2q_p16 vld2q_u16 ++ ++uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) ++{ ++ uint8x8x2_t v; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; ++ __m128i ld128; ++ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd); ++ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); ++ return v; ++} ++ ++uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint16x4x2_t v; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ __m128i ld128; ++ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd); ++ vst1q_u16((v.val), ld128); ++ return v; ++} ++ ++uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint32x2x2_t v; ++ __m128i ld128; ++ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ vst1q_u32((v.val), ld128); ++ return v; ++} ++ ++uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) ++{ ++ uint64x1x2_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ return v; ++} ++ ++int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) ++ ++int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) ++ ++int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) ++ ++int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) ++ ++float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1_f16 for example ++ ++float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) ++{ ++ float32x2x2_t v; ++ v.val[0].m64_f32[0] = *(ptr); ++ v.val[0].m64_f32[1] = *(ptr + 2); ++ v.val[1].m64_f32[0] = *(ptr + 1); ++ v.val[1].m64_f32[1] = *(ptr + 3); ++ return v; ++} ++ ++poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++#define vld2_p8 vld2_u8 ++ ++poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++#define vld2_p16 vld2_u16 ++ ++//******************** Triplets *************************************** ++//********************************************************************* ++uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> ++ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 ++ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, ++ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 ++ uint8x16x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; ++ ++ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 ++ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 ++ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 ++ ++ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 ++ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 ++ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 ++ ++ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 ++ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x ++ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, ++ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 ++ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, ++ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, ++ ++ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, ++ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 ++ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 ++ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, ++ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, ++ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 ++ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 ++ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, ++ ++ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, ++ v.val[2] = _mm_srli_si128(tmp1,11); 
//b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 ++ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, ++ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, ++ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, ++ return v; ++} ++ ++uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] ++{ ++ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 ++ uint16x8x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; ++ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; ++ ++ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, ++ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 ++ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 ++ ++ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, ++ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 ++ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 ++ ++ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, ++ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x ++ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 ++ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 ++ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, ++ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5 ++ ++ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7 ++ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 ++ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 ++ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6, ++ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5, ++ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0, ++ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, ++ ++ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, ++ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 ++ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7, ++ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7, ++ return v; ++} ++ ++uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ uint32x4x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, ++ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 ++ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, ++ ++ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); 
//a0,a3,a1,a2 ++ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 ++ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 ++ ++ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 ++ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 ++ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 ++ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, ++ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 ++ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 ++ return v; ++} ++ ++int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) ++ ++int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) ++ ++int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) ++ ++float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ float32x4x3_t v; ++ __m128 tmp0, tmp1,tmp2, tmp3; ++ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, ++ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 ++ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, ++ ++ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 ++ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 ++ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 ++ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 ++ ++ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 ++ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 ++ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, ++ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 ++ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 ++ return v; ++} ++ ++poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++#define vld3q_p8 vld3q_u8 ++ ++poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++#define vld3q_p16 vld3q_u16 ++ ++uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0] ++{ ++ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 ++ uint8x8x3_t v; ++ __m128i val0, val1, val2, tmp0, tmp1; ++ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; ++ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 ++ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7 ++ ++ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); 
//a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, ++ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x ++ val0 = _mm_slli_si128(tmp0,10); ++ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 ++ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x ++ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x ++ _M64(v.val[0], val0); ++ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, ++ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, ++ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 ++ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 ++ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x ++ _M64(v.val[1], val1); ++ ++ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, ++ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 ++ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7, ++ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ uint16x4x3_t v; ++ __m128i val0, val1, val2, tmp0, tmp1; ++ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; ++ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 ++ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x ++ ++ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 ++ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3, ++ val0 = _mm_slli_si128(tmp0,10); ++ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0, ++ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1 ++ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0 ++ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x ++ _M64(v.val[0], val0); ++ ++ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 ++ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0, ++ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0, ++ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0 ++ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x ++ _M64(v.val[1], val1); ++ ++ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 ++ tmp1 = _mm_srli_si128(tmp1,4); ++ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, ++ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] ++{ ++ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 ++ uint32x2x3_t v; ++ __m128i val0, val1, val2; ++ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, ++ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x ++ ++ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 ++ _M64(v.val[0], val0); ++ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1, ++ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1 ++ _M64(v.val[1], val1); ++ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x, ++ _M64(v.val[2], val2); ++ return v; ++} ++uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // 
VLD1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] ++{ ++ uint64x1x3_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ return v; ++} ++ ++int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) ++ ++int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) ++ ++int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) ++ ++int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) ++ ++float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) ++{ ++ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 ++ float32x2x3_t v; ++ v.val[0].m64_f32[0] = *(ptr); ++ v.val[0].m64_f32[1] = *(ptr + 3); ++ ++ v.val[1].m64_f32[0] = *(ptr + 1); ++ v.val[1].m64_f32[1] = *(ptr + 4); ++ ++ v.val[2].m64_f32[0] = *(ptr + 2); ++ v.val[2].m64_f32[1] = *(ptr + 5); ++ return v; ++} ++ ++poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++#define vld3_p8 vld3_u8 ++ ++poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++#define vld3_p16 vld3_u16 ++ ++//*************** Quadruples load ******************************** ++//***************************************************************** ++uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] ++{ ++ uint8x16x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ ++ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 ++ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15 ++ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15 ++ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 ++ ++ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 ++ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 ++ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 ++ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 ++ ++ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 ++ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 ++ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 ++ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 ++ ++ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 ++ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 ++ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, ++ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 ++ ++ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 ++ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 ++ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 ++ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 ++ return v; ++} ++ ++uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] ++{ ++ uint16x8x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 ++ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 ++ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 ++ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 ++ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, ++ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, ++ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7 ++ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7 ++ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5 ++ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 ++ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 ++ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7 ++ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, ++ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 ++ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, ++ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 ++ return v; ++} ++ ++uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] ++{ ++ uint32x4x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ v.val[0] = vld1q_u32 (ptr); ++ v.val[1] = vld1q_u32 ((ptr + 4)); ++ v.val[2] = vld1q_u32 ((ptr + 8)); ++ v.val[3] = vld1q_u32 ((ptr + 12)); ++ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); ++ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]); ++ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); ++ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); ++ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); ++ v.val[1] = 
_mm_unpackhi_epi64(tmp0, tmp1); ++ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); ++ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); ++ return v; ++} ++ ++int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) ++ ++int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) ++ ++int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) ++ ++float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] ++{ ++ float32x4x4_t v; ++ __m128 tmp3, tmp2, tmp1, tmp0; ++ ++ v.val[0] = vld1q_f32 ((float*) ptr); ++ v.val[1] = vld1q_f32 ((float*) (ptr + 4)); ++ v.val[2] = vld1q_f32 ((float*) (ptr + 8)); ++ v.val[3] = vld1q_f32 ((float*) (ptr + 12)); ++ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); ++ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); ++ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); ++ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); ++ v.val[0] = _mm_movelh_ps(tmp0, tmp2); ++ v.val[1] = _mm_movehl_ps(tmp2, tmp0); ++ v.val[2] = _mm_movelh_ps(tmp1, tmp3); ++ v.val[3] = _mm_movehl_ps(tmp3, tmp1); ++ return v; ++} ++ ++poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++#define vld4q_p8 vld4q_u8 ++ ++poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++#define vld4q_p16 vld4q_s16 ++ ++uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] ++{ ++ uint8x8x4_t v; ++ __m128i sh0, sh1; ++ __m128i val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; ++ ++ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] ++ val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] ++ ++ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8); ++ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 ++ vst1q_u8(&v.val[0], val0 ); ++ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 ++ vst1q_u8(&v.val[2], val2 ); ++ return v; ++} ++ ++uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] ++{ ++ uint16x4x4_t v; ++ __m128i sh0, sh1; ++ __m128i val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 ++ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] ++ val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] ++ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16); ++ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 
1,5,9,13 ++ vst1q_u16(&v.val[0], val0 ); ++ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 ++ vst1q_u16(&v.val[2], val2 ); ++ return v; ++} ++ ++uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) ++{ ++ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 ++ uint32x2x4_t v; ++ __m128i val0, val01, val2; ++ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, ++ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 ++ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1, ++ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1 ++ vst1q_u32(&v.val[0], val01); ++ vst1q_u32(&v.val[2], val2 ); ++ return v; ++} ++ ++uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] ++{ ++ uint64x1x4_t v; ++ v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1] ++ v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1] ++ v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3] ++ v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3] ++ return v; ++} ++ ++int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) ++ ++int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) ++ ++int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) ++ ++int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) ++ ++float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example ++ ++float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] ++{ ++ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 ++ float32x2x4_t res; ++ res.val[0].m64_f32[0] = *(ptr); ++ res.val[0].m64_f32[1] = *(ptr + 4); ++ res.val[1].m64_f32[0] = *(ptr + 1); ++ res.val[1].m64_f32[1] = *(ptr + 5); ++ res.val[2].m64_f32[0] = *(ptr + 2); ++ res.val[2].m64_f32[1] = *(ptr + 6); ++ res.val[3].m64_f32[0] = *(ptr + 3); ++ res.val[3].m64_f32[1] = *(ptr + 7); ++ return res; ++} ++ ++poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++#define vld4_p8 vld4_u8 ++ ++poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++#define vld4_p16 vld4_u16 ++ ++//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* ++//******************************************************************************************************************* ++uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] ++{ ++ uint8x8x2_t v; ++ __m128i val0, val1; ++ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ vst1q_u8(v.val, val0); ++ return v; ++} ++ ++uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] ++{ ++ uint16x4x2_t v; ++ __m128i val0, val1; ++ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0) ++ _M64(v.val[0], val0); ++ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1) ++ _M64(v.val[1], val1); ++ return v; ++} ++ ++uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] ++{ ++ uint32x2x2_t v; ++ __m128i val0; ++ val0 = LOAD_SI128(ptr); //0,1,x,x ++ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 ++ vst1q_u32(v.val, val0); ++ return v; ++} ++ ++uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_dup_u64 vld2_u64 ++ ++int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) ++ ++int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) ++ ++int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr) ++ ++int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) ++ ++float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++// IA32 SIMD doesn't work with 
16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] ++{ ++ float32x2x2_t v; ++ v.val[0].m64_f32[0] = *(ptr); //0,0 ++ v.val[0].m64_f32[1] = *(ptr); //0,0 ++ v.val[1].m64_f32[0] = *(ptr + 1); //1,1 ++ v.val[1].m64_f32[1] = *(ptr + 1); //1,1 ++ return v; ++} ++ ++poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++#define vld2_dup_p8 vld2_dup_u8 ++ ++poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++#define vld2_dup_p16 vld2_dup_s16 ++ ++//************* Duplicate (or propagate)triplets: ******************* ++//******************************************************************** ++//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes ++uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] ++{ ++ uint8x8x3_t v; ++ __m128i val0, val1, val2; ++ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, ++ vst1q_u8(v.val, val0); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] ++{ ++ uint16x4x3_t v; ++ __m128i val0, val1, val2; ++ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0) ++ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1) ++ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2) ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] ++{ ++ uint32x2x3_t v; ++ __m128i val0, val1, val2; ++ val2 = LOAD_SI128(ptr); //0,1,2,x ++ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 ++ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 ++ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0 ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] ++{ ++ uint64x1x3_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ return v; ++} ++ ++int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) ++ ++int16x4x3_t vld3_dup_s16(__transfersize(3) 
int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) ++ ++int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) ++ ++int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) ++ ++ ++float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] ++{ ++ float32x2x3_t v; ++ int i; ++ for (i = 0; i<3; i++) { ++ v.val[i].m64_f32[0] = *(ptr + i); ++ v.val[i].m64_f32[1] = *(ptr + i); ++ } ++ return v; ++} ++ ++poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_p8 vld3_dup_u8 ++ ++poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_p16 vld3_dup_s16 ++ ++ ++//************* Duplicate (or propagate) quadruples: ******************* ++//*********************************************************************** ++//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes ++uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint8x8x4_t v; ++ __m128i val0, val1, val2; ++ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 ++ vst1q_u8(&v.val[0], val0); ++ vst1q_u8(&v.val[2], val2); ++ return v; ++} ++ ++uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint16x4x4_t v; ++ __m128i val0, val1, val2, val3; ++ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0) ++ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1) ++ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2) ++ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3) ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ _M64(v.val[3], val3); ++ return v; ++} ++ ++uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint32x2x4_t v; ++ __m128i val0, val1, val2, val3; ++ val3 = LOAD_SI128(ptr); //0,1,2,3 ++ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 ++ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 
4) | (3 << 6)); //1,1,2,3 ++ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 ++ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ _M64(v.val[3], val3); ++ return v; ++} ++ ++uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] ++{ ++ uint64x1x4_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ v.val[3].m64_u64[0] = *(ptr + 3); ++ return v; ++} ++ ++int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) ++ ++int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) ++ ++int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) ++ ++int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) ++ ++float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ float32x2x4_t v; ++ int i; ++ for (i = 0; i<4; i++) { ++ v.val[i].m64_f32[0] = *(ptr + i); ++ v.val[i].m64_f32[1] = *(ptr + i); ++ } ++ return v; ++} ++ ++poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_p8 vld4_dup_u8 ++ ++poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_p16 vld4_dup_u16 ++ ++ ++//********************************************************************************** ++//*******************Lane loads for an N-element structures *********************** ++//********************************************************************************** ++//********************** Lane pairs ************************************************ ++//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon ++//we assume src is 16 bit aligned ++ ++//!!!!!! 
Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error ++//to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined ++ ++//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0] ++{ ++ uint16x8x2_t v; ++ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] ++{ ++ uint32x4x2_t v; ++ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); ++ return v; ++} ++#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane) ++ ++//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane) ++{ ++ int16x8x2_t v; ++ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane) ++ ++//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) ++{ ++ int32x4x2_t v; ++ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); ++ return v; ++} ++#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) ++ ++//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] ++{ ++ float32x4x2_t v; ++ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane) ++ ++//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++#define vld2q_lane_p16 vld2q_lane_u16 ++ ++//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t 
vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] ++{ ++ uint8x8x2_t v; ++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane) ++ ++//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane) ++{ ++ uint16x4x2_t v; ++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane) ++{ ++ uint32x2x2_t v; ++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane) ++ ++//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) ++ ++//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] ++int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) ++ ++//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] ++int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane) ++ ++//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane) ++{ ++ float32x2x2_t v; ++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane) ++ ++//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] ++#define 
vld2_lane_p8 vld2_lane_u8
++
++//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
++poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
++#define vld2_lane_p16 vld2_lane_u16
++
++//*********** Lane triplets **********************
++//*************************************************
++//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane position
++//we assume src is 16-byte aligned
++
++//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++{
++ uint16x8x3_t v;
++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
++ return v;
++}
++#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
++
++//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++{
++ uint32x4x3_t v;
++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
++ return v;
++}
++#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
++
++//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++{
++ int16x8x3_t v;
++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
++ return v;
++}
++#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
++
++//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++{
++ int32x4x3_t v;
++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
++ return v;
++}
++#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
++
++float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++//current IA SIMD doesn't support float16
++#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
++
++
++//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3)
int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
++{
++ float32x4x3_t v;
++ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
++ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
++ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
++ return v;
++}
++#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
++
++poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
++#define vld3q_lane_p16 vld3q_lane_u16
++
++//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
++_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
++{
++ uint8x8x3_t v;
++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
++ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
++ return v;
++}
++#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
++
++//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
++_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
++{
++ uint16x4x3_t v;
++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
++ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
++ return v;
++}
++#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
++
++//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
++_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
++{
++ //need to merge into 128 bit anyway
++ uint32x2x3_t v;
++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
++ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
++ return v;
++}
++#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
++
++int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
++#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
++
++int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
++#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
++
++int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
++#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
++
++float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
++//current IA SIMD doesn't support float16
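++
++//NOTE: the block below is an illustrative sketch added for documentation only; it is not part
++//of the upstream header and is guarded by #if 0 so it is never compiled. It shows how the
++//lane-triplet loads above are meant to be used: three consecutive scalars are inserted into
++//the selected lane of the three destination vectors, while every other lane keeps its previous
++//contents. The buffer and function names below are hypothetical.
++#if 0 //usage sketch only
++static void neon2sse_vld3_lane_u16_example(void)
++{
++    uint16_t buf[3] = {10, 20, 30}; //three interleaved source scalars
++    uint16x4x3_t acc;
++    acc.val[0] = vdup_n_u16(0); //start from zeroed destination vectors
++    acc.val[1] = vdup_n_u16(0);
++    acc.val[2] = vdup_n_u16(0);
++    acc = vld3_lane_u16(buf, acc, 2); //lane 2 of acc.val[0..2] becomes 10, 20, 30
++}
++#endif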
++
++//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
++_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
++{
++ float32x2x3_t v;
++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
++ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
++ return v;
++}
++#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
++
++//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
++#define vld3_lane_p8 vld3_lane_u8
++
++//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
++#define vld3_lane_p16 vld3_lane_u16
++
++//******************* Lane Quadruples load ***************************
++//*********************************************************************
++//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane position
++//we assume src is 16-byte aligned
++
++//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
++{
++ uint16x8x4_t v;
++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
++ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
++ return v;
++}
++#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
++
++//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
++{
++ uint32x4x4_t v;
++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
++ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
++ return v;
++}
++#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
++
++//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
++
++//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
++#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
++
++//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16
const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) ++{ ++ float32x4x4_t v; ++ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); ++ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); ++ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); ++ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); ++ return v; ++} ++#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane) ++ ++//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++#define vld4q_lane_p16 vld4q_lane_u16 ++ ++//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane) ++{ ++ uint8x8x4_t v; ++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane) ++ ++//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane) ++{ ++ uint16x4x4_t v; ++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane) ++{ ++ uint32x2x4_t v; ++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane) ++ ++//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); ++#define 
vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane) ++ ++//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); ++#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane) ++ ++//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); ++#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane) ++ ++//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane) ++{ ++ //serial solution may be faster ++ float32x2x4_t v; ++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane) ++ ++//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); ++#define vld4_lane_p8 vld4_lane_u8 ++ ++//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); ++#define vld4_lane_p16 vld4_lane_u16 ++ ++//******************* Store duplets ********************************************* ++//******************************************************************************** ++//here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function ++//If necessary you need to modify all store functions accordingly. 
See more comments to "Store single" functions ++//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) ++{ ++ uint8x16x2_t v; ++ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); ++ vst1q_u8 (ptr, v.val[0]); ++ vst1q_u8 ((ptr + 16), v.val[1]); ++} ++#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) ++ ++//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) ++{ ++ uint16x8x2_t v; ++ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); ++ vst1q_u16 (ptr, v.val[0]); ++ vst1q_u16 ((ptr + 8), v.val[1]); ++} ++#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) ++ ++//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) ++{ ++ uint32x4x2_t v; ++ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); ++ vst1q_u32 (ptr, v.val[0]); ++ vst1q_u32 ((ptr + 4), v.val[1]); ++} ++#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) ++ ++//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] ++void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); ++#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) ++ ++//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); ++#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) ++ ++//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] ++void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); ++#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) ++ ++//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) ++{ ++ float32x4x2_t v; ++ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); ++ vst1q_f32 (ptr, v.val[0]); ++ vst1q_f32 ((ptr + 4), v.val[1]); ++} ++#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) ++ ++//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] ++void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); ++#define vst2q_p8 vst2q_u8 ++ ++//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); ++#define vst2q_p16 vst2q_u16 ++ ++//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u8 (ptr, v0); ++} ++#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, 
&val) ++ ++//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u16 (ptr, v0); ++} ++#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val) ++ ++//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u32 (ptr, v0); ++} ++#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val) ++ ++ ++//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] ++void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); ++_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++} ++#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val) ++ ++//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] ++#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) ++ ++//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] ++#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) ++ ++//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] ++#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) ++ ++//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); ++#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) ++ ++//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val) ++{ ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[0].m64_f32[1]; ++ *(ptr + 3) = val->val[1].m64_f32[1]; ++} ++#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val) ++ ++//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++#define vst2_p8 vst2_u8 ++ ++//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++#define vst2_p16 vst2_u16 ++ ++//******************** Triplets store ***************************************** ++//****************************************************************************** ++//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) ++{ ++ uint8x16x3_t v; ++ __m128i v0,v1,v2, cff, bldmask; ++ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; ++ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t 
mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; ++ ++ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 ++ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 ++ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 ++ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding ++ cff = _mm_cmpeq_epi8(v0, v0); //all ff ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); ++ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); ++ vst1q_u8(ptr, v.val[0]); ++ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); ++ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); ++ vst1q_u8((ptr + 16), v.val[1]); ++ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); ++ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); ++ vst1q_u8((ptr + 32), v.val[2]); ++} ++#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) ++ ++//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) ++{ ++ uint16x8x3_t v; ++ __m128i v0,v1,v2, cff, bldmask; ++ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; ++ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; ++ ++ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 ++ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, ++ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 ++ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding ++ cff = _mm_cmpeq_epi16(v0, v0); //all ff ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); ++ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); ++ vst1q_u16(ptr, v.val[0]); ++ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); ++ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); ++ vst1q_u16((ptr + 8), v.val[1]); ++ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] 
data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); ++ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); ++ vst1q_u16((ptr + 16), v.val[2]); ++} ++#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) ++ ++//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 ++ uint32x4x3_t v; ++ __m128i tmp0, tmp1,tmp2; ++ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 ++ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 ++ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 ++ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, ++ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 ++ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 ++ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 ++ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, ++ ++ vst1q_u32(ptr, v.val[0]); ++ vst1q_u32((ptr + 4), v.val[1]); ++ vst1q_u32((ptr + 8), v.val[2]); ++} ++#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) ++ ++//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); ++void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); ++#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) ++ ++//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); ++void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); ++#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) ++ ++//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); ++void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); ++#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) ++ ++//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] ++void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) ++{ ++ float32x4x3_t v; ++ __m128 tmp0, tmp1,tmp2; ++ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 ++ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 ++ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 ++ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, ++ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 ++ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 ++ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 ++ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, ++ ++ vst1q_f32( ptr, v.val[0]); ++ vst1q_f32( (ptr + 4), v.val[1]); ++ vst1q_f32( (ptr + 8), v.val[2]); ++} ++#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) ++ ++//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] ++void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); ++#define 
vst3q_p8 vst3q_u8 ++ ++//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] ++void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); ++#define vst3q_p16 vst3q_u16 ++ ++//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val) ++{ ++ __m128i tmp, sh0, sh1, val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; ++ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; ++ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; ++ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; ++ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) ); ++ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) ++ val2 = _pM128i(val->val[2]); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); ++ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); ++ vst1q_u8(ptr, val0); //store as 128 bit structure ++ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); ++ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); ++ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory ++} ++#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val) ++ ++//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val) ++{ ++ __m128i tmp, val0, val1, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] ++ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] ++ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); ++ val2 = _pM128i(val->val[2]); ++ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); ++ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f); ++ vst1q_u16(ptr, val0); //store as 128 bit structure ++ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); ++ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); ++ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order ++ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory ++} ++#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val) ++ ++//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val) ++{ ++ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; ++ __m128i val0, val1; ++ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5 ++ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 ++ val1 = _mm_srli_si128(val0, 8); //4,5, x,x ++ _M64((*(__m64_128*)(ptr + 4)), val1); ++ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2 ++ val0 = 
_mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 ++ vst1q_u32(ptr, val0); //store as 128 bit structure ++} ++#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val) ++ ++//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++ *(ptr + 2) = val->val[2].m64_u64[0]; ++} ++#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val) ++ ++//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0] ++#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val) ++ ++//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0] ++#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val) ++ ++//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] ++#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val) ++ ++//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0] ++#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val) ++ ++//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] ++void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val) ++{ ++ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5 ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[2].m64_f32[0]; ++ *(ptr + 3) = val->val[0].m64_f32[1]; ++ *(ptr + 4) = val->val[1].m64_f32[1]; ++ *(ptr + 5) = val->val[2].m64_f32[1]; ++} ++#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val) ++ ++//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] ++void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); ++#define vst3_p8 vst3_u8 ++ ++//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] ++void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); ++#define vst3_p16 vst3_s16 ++ ++//*************** Quadruples store ******************************** ++//********************************************************************* ++//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) ++{ ++ __m128i tmp1, tmp2, res; ++ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 ++ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 ++ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 ++ vst1q_u8(ptr, res); ++ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 ++ vst1q_u8((ptr + 16), res); ++ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // ++ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // ++ res = _mm_unpacklo_epi16(tmp1, tmp2); // ++ vst1q_u8((ptr + 32), 
res); ++ res = _mm_unpackhi_epi16(tmp1, tmp2); // ++ vst1q_u8((ptr + 48), res); ++} ++#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) ++ ++//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) ++{ ++ uint16x8x4_t v; ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); ++ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); ++ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); ++ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); ++ vst1q_u16(ptr, v.val[0]); ++ vst1q_u16((ptr + 8), v.val[1]); ++ vst1q_u16((ptr + 16),v.val[2]); ++ vst1q_u16((ptr + 24), v.val[3]); ++} ++#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val) ++ ++//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) ++{ ++ uint16x8x4_t v; ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); ++ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); ++ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); ++ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); ++ vst1q_u32(ptr, v.val[0]); ++ vst1q_u32((ptr + 4), v.val[1]); ++ vst1q_u32((ptr + 8), v.val[2]); ++ vst1q_u32((ptr + 12), v.val[3]); ++} ++#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) ++ ++//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); ++void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); ++#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) ++ ++//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); ++void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); ++#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) ++ ++//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); ++void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); ++#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) ++ ++//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) ++{ ++ __m128 tmp3, tmp2, tmp1, tmp0; ++ float32x4x4_t v; ++ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); ++ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); ++ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); ++ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); ++ v.val[0] = _mm_movelh_ps(tmp0, tmp2); ++ v.val[1] = _mm_movehl_ps(tmp2, tmp0); ++ v.val[2] = _mm_movelh_ps(tmp1, tmp3); ++ v.val[3] = _mm_movehl_ps(tmp3, tmp1); ++ vst1q_f32(ptr, 
v.val[0]); ++ vst1q_f32((ptr + 4), v.val[1]); ++ vst1q_f32((ptr + 8), v.val[2]); ++ vst1q_f32((ptr + 12), v.val[3]); ++} ++#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) ++ ++//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); ++#define vst4q_p8 vst4q_u8 ++ ++//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); ++#define vst4q_p16 vst4q_s16 ++ ++//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val) ++{ ++ __m128i sh0, sh1, val0, val2; ++ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, ++ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 ++ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, ++ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 ++ vst1q_u8(ptr, val0); ++ vst1q_u8((ptr + 16), val2); ++} ++#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val) ++ ++//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val) ++{ ++ __m128i sh0, sh1, val0, val2; ++ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1, ++ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3 ++ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 ++ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 ++ vst1q_u16(ptr, val0); //store as 128 bit structure ++ vst1q_u16((ptr + 8), val2); ++} ++#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val) ++ ++//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val) ++{ ++ //0,4, 1,5, 2,6, 3,7 ++ __m128i sh0, sh1, val0, val1; ++ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5 ++ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7 ++ val0 = _mm_unpacklo_epi64(sh0,sh1); // ++ val1 = _mm_unpackhi_epi64(sh0,sh1); // ++ vst1q_u32(ptr, val0); //store as 128 bit structure ++ vst1q_u32((ptr + 4), val1); ++} ++#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val) ++ ++//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++ *(ptr + 2) = val->val[2].m64_u64[0]; ++ *(ptr + 3) = val->val[3].m64_u64[0]; ++} ++#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val) ++ ++//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] ++#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) ++ ++//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] ++#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) ++ ++//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, 
d3}, [r0] ++#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) ++ ++//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); ++#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) ++ ++//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val) ++{ ++ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7 ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[2].m64_f32[0]; ++ *(ptr + 3) = val->val[3].m64_f32[0]; ++ *(ptr + 4) = val->val[0].m64_f32[1]; ++ *(ptr + 5) = val->val[1].m64_f32[1]; ++ *(ptr + 6) = val->val[2].m64_f32[1]; ++ *(ptr + 7) = val->val[3].m64_f32[1]; ++} ++#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val) ++ ++//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); ++#define vst4_p8 vst4_u8 ++ ++//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); ++#define vst4_p16 vst4_u16 ++ ++//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* ++//******************************************************************************************************************** ++//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) ++{ ++ vst1q_lane_s16(ptr, val->val[0], lane); ++ vst1q_lane_s16((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_u32(ptr, val->val[0], lane); ++ vst1q_lane_u32((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); ++#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] ++void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); ++#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int 
lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane) ++ ++//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); ++#define vst2q_lane_p16 vst2q_lane_s16 ++ ++//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0] ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++} ++#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++} ++#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] ++void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++} ++#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); ++#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) ++ ++//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); ++#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] ++void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); ++#define vst2_lane_s32(ptr, val, lane) 
vst2_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++} ++#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane) ++ ++//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++#define vst2_lane_p8 vst2_lane_u8 ++ ++//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++#define vst2_lane_p16 vst2_lane_u16 ++ ++//************************* Triple lanes stores ******************************************************* ++//******************************************************************************************************* ++//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) ++{ ++ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); ++ vst1q_lane_u16((ptr + 2), val->val[2], lane); ++} ++#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) ++{ ++ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); ++ vst1q_lane_u32((ptr + 2), val->val[2], lane); ++} ++#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); ++#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) ++ ++//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); ++#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) ++ ++//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++ vst1q_lane_f32((ptr + 2), val->val[2], lane); ++} ++#define 
vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); ++#define vst3q_lane_p16 vst3q_lane_s16 ++ ++//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++ *(ptr + 2) = val->val[2].m64_u8[lane]; ++} ++#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++ *(ptr + 2) = val->val[2].m64_u16[lane]; ++} ++#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++ *(ptr + 2) = val->val[2].m64_u32[lane]; ++} ++#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); ++#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) ++ ++//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); ++#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) ++ ++//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); ++#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) ++ ++//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); ++_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++ *(ptr + 2) = 
val->val[2].m64_f32[lane]; ++} ++#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); ++#define vst3_lane_p8 vst3_lane_u8 ++ ++//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); ++#define vst3_lane_p16 vst3_lane_s16 ++ ++//******************************** Quadruple lanes stores *********************************************** ++//******************************************************************************************************* ++//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) ++{ ++ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); ++ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); ++} ++#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane) ++{ ++ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); ++ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); ++} ++#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); ++#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) ++ ++//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); ++#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) ++ ++//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++ vst1q_lane_f32((ptr + 2), val->val[2], lane); ++ vst1q_lane_f32((ptr + 3), val->val[3], lane); ++} ++#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], 
d6[0]}, [r0] ++void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); ++#define vst4q_lane_p16 vst4q_lane_u16 ++ ++//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++ *(ptr + 2) = val->val[2].m64_u8[lane]; ++ *(ptr + 3) = val->val[3].m64_u8[lane]; ++} ++#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++ *(ptr + 2) = val->val[2].m64_u16[lane]; ++ *(ptr + 3) = val->val[3].m64_u16[lane]; ++} ++#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++ *(ptr + 2) = val->val[2].m64_u32[lane]; ++ *(ptr + 3) = val->val[3].m64_u32[lane]; ++} ++#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) ++ ++//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++ *(ptr + 2) = val->val[2].m64_f32[lane]; ++ *(ptr + 3) = val->val[3].m64_f32[lane]; ++} ++#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); ++#define vst4_lane_p8 vst4_lane_u8 ++ ++//void 
vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); ++#define vst4_lane_p16 vst4_lane_u16 ++ ++//************************************************************************************************** ++//************************ Extract lanes from a vector ******************************************** ++//************************************************************************************************** ++//These intrinsics extract a single lane (element) from a vector. ++uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++#define vget_lane_u8(vec, lane) vec.m64_u8[lane] ++ ++uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] ++#define vget_lane_u16(vec, lane) vec.m64_u16[lane] ++ ++ ++uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_u32(vec, lane) vec.m64_u32[lane] ++ ++int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] ++#define vget_lane_s8(vec, lane) vec.m64_i8[lane] ++ ++int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] ++#define vget_lane_s16(vec, lane) vec.m64_i16[lane] ++ ++int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_s32(vec, lane) vec.m64_i32[lane] ++ ++poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++#define vget_lane_p8 vget_lane_u8 ++ ++poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] ++#define vget_lane_p16 vget_lane_u16 ++ ++float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_f32(vec, lane) vec.m64_f32[lane] ++ ++uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++#define vgetq_lane_u8 _MM_EXTRACT_EPI8 ++ ++uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] ++#define vgetq_lane_u16 _MM_EXTRACT_EPI16 ++ ++uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++#define vgetq_lane_u32 _MM_EXTRACT_EPI32 ++ ++int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] ++#define vgetq_lane_s8 vgetq_lane_u8 ++ ++int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] ++#define vgetq_lane_s16 vgetq_lane_u16 ++ ++int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++#define vgetq_lane_s32 vgetq_lane_u32 ++ ++poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++#define vgetq_lane_p8 vgetq_lane_u8 ++ ++poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] ++#define vgetq_lane_p16 vgetq_lane_u16 ++ ++float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) ++{ ++ int32_t ilane; ++ ilane = _MM_EXTRACT_PS(vec,lane); ++ return *(float*)&ilane; ++} ++ ++int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++#define vget_lane_s64(vec, lane) vec.m64_i64[0] ++ ++uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++#define vget_lane_u64(vec, 
lane) vec.m64_u64[0] ++ ++ ++int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++#define vgetq_lane_s64 (int64_t) vgetq_lane_u64 ++ ++uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++#define vgetq_lane_u64 _MM_EXTRACT_EPI64 ++ ++// ***************** Set lanes within a vector ******************************************** ++// ************************************************************************************** ++//These intrinsics set a single lane (element) within a vector. ++//same functions as vld1_lane_xx ones, but take the value to be set directly. ++ ++uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane) ++{ ++ uint8_t val; ++ val = value; ++ return vld1_lane_u8(&val, vec, lane); ++} ++ ++uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16_t val; ++ val = value; ++ return vld1_lane_u16(&val, vec, lane); ++} ++ ++uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32_t val; ++ val = value; ++ return vld1_lane_u32(&val, vec, lane); ++} ++ ++int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane) ++{ ++ int8_t val; ++ val = value; ++ return vld1_lane_s8(&val, vec, lane); ++} ++ ++int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane) ++{ ++ int16_t val; ++ val = value; ++ return vld1_lane_s16(&val, vec, lane); ++} ++ ++int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane) ++{ ++ int32_t val; ++ val = value; ++ return vld1_lane_s32(&val, vec, lane); ++} ++ ++poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++#define vset_lane_p8 vset_lane_u8 ++ ++poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++#define vset_lane_p16 vset_lane_u16 ++ ++float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32_t val; ++ val = value; ++ return vld1_lane_f32(&val, vec, lane); ++} ++ ++uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane) ++{ ++ uint8_t val; ++ val = value; ++ return vld1q_lane_u8(&val, vec, lane); ++} ++ ++uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane) ++{ ++ uint16_t val; ++ val = value; ++ return vld1q_lane_u16(&val, 
vec, lane); ++} ++ ++uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane) ++{ ++ uint32_t val; ++ val = value; ++ return vld1q_lane_u32(&val, vec, lane); ++} ++ ++int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane) ++{ ++ int8_t val; ++ val = value; ++ return vld1q_lane_s8(&val, vec, lane); ++} ++ ++int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane) ++{ ++ int16_t val; ++ val = value; ++ return vld1q_lane_s16(&val, vec, lane); ++} ++ ++int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane) ++{ ++ int32_t val; ++ val = value; ++ return vld1q_lane_s32(&val, vec, lane); ++} ++ ++poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++#define vsetq_lane_p8 vsetq_lane_u8 ++ ++poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++#define vsetq_lane_p16 vsetq_lane_u16 ++ ++float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane) ++{ ++ float32_t val; ++ val = value; ++ return vld1q_lane_f32(&val, vec, lane); ++} ++ ++int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane) ++{ ++ int64_t val; ++ val = value; ++ return vld1_lane_s64(&val, vec, lane); ++} ++ ++uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane) ++{ ++ uint64_t val; ++ val = value; ++ return vld1_lane_u64(&val, vec, lane); ++} ++ ++int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane) ++{ ++ uint64_t val; ++ val = value; ++ return vld1q_lane_s64(&val, vec, lane); ++} ++ ++uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++#define vsetq_lane_u64 vsetq_lane_s64 ++ ++// ******************************************************************************* ++// **************** Initialize a vector from bit pattern *************************** ++// ******************************************************************************* ++//These intrinsics create a vector from a literal bit pattern. 
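/* Illustrative usage sketch for the vcreate_* intrinsics declared just below (hypothetical helper name; assumes this NEON_2_SSE header is included as arm_neon.h on an x86 target, so byte n of the 64-bit pattern becomes lane n). */
static uint8_t neon2sse_example_vcreate(void)
{
    uint64_t pattern = 0x0706050403020100ULL; /* literal bit pattern for the d-register */
    uint8x8_t v = vcreate_u8(pattern);        /* reinterpret the 64 bits as 8 x u8 lanes */
    return vget_lane_u8(v, 3);                /* lane 3 holds byte 3 of the pattern, 0x03 */
}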
++int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s8(a) (*(__m64_128*)&(a)) ++ ++ ++int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s16 vcreate_s8 ++ ++int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s32 vcreate_s8 ++ ++float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 ++//no IA32 SIMD avalilable ++ ++float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_f32(a) (*(__m64_128*)&(a)) ++ ++uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u8 vcreate_s8 ++ ++uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u16 vcreate_s16 ++ ++uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u32 vcreate_s32 ++ ++uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u64 vcreate_s8 ++ ++ ++poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_p8 vcreate_u8 ++ ++poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_p16 vcreate_u16 ++ ++int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s64 vcreate_u64 ++ ++//********************* Set all lanes to same value ******************************** ++//********************************************************************************* ++//These intrinsics set all lanes to the same value. ++uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint8x8_t res; ++ int i; ++ for (i = 0; i<8; i++) { ++ res.m64_u8[i] = value; ++ } ++ return res; ++} ++ ++uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint16x4_t res; ++ int i; ++ for (i = 0; i<4; i++) { ++ res.m64_u16[i] = value; ++ } ++ return res; ++} ++ ++uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = value; ++ res.m64_u32[1] = value; ++ return res; ++} ++ ++int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int8x8_t res; ++ int i; ++ for (i = 0; i<8; i++) { ++ res.m64_i8[i] = value; ++ } ++ return res; ++} ++ ++int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int16x4_t res; ++ int i; ++ for (i = 0; i<4; i++) { ++ res.m64_i16[i] = value; ++ } ++ return res; ++} ++ ++int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t res; ++ res.m64_i32[0] = value; ++ res.m64_i32[1] = value; ++ return res; ++} ++ ++poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 ++#define vdup_n_p8 vdup_n_u8 ++ ++poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 ++#define vdup_n_p16 vdup_n_s16 ++ ++float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = value; ++ res.m64_f32[1] = value; ++ return res; ++} ++ ++uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) ++ ++uint16x8_t 
vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) ++ ++uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) ++ ++int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_s8 _mm_set1_epi8 ++ ++int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_s16 _mm_set1_epi16 ++ ++int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_s32 _mm_set1_epi32 ++ ++poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_p8 vdupq_n_u8 ++ ++poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_p16 vdupq_n_u16 ++ ++float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_f32 _mm_set1_ps ++ ++int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = value; ++ return res; ++} ++ ++uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = value; ++ return res; ++} ++ ++int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) ++{ ++ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate ++ return LOAD_SI128(value2); ++} ++ ++uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate ++ return LOAD_SI128(val); ++} ++ ++//**** Set all lanes to same value ************************ ++//Same functions as above - just aliaces.******************** ++//Probably they reflect the fact that 128-bit functions versions use VMOV instruction ********** ++uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 ++#define vmov_n_u8 vdup_n_s8 ++ ++uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 ++#define vmov_n_u16 vdup_n_s16 ++ ++uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 ++#define vmov_n_u32 vdup_n_u32 ++ ++int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 ++#define vmov_n_s8 vdup_n_s8 ++ ++int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 ++#define vmov_n_s16 vdup_n_s16 ++ ++int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 ++#define vmov_n_s32 vdup_n_s32 ++ ++poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 ++#define vmov_n_p8 vdup_n_u8 ++ ++poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 ++#define vmov_n_p16 vdup_n_s16 ++ ++float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 ++#define vmov_n_f32 vdup_n_f32 ++ ++uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_u8 vdupq_n_u8 ++ ++uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_u16 vdupq_n_s16 ++ ++uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_u32 vdupq_n_u32 ++ ++int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_s8 vdupq_n_s8 ++ ++int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_s16 vdupq_n_s16 ++ ++int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_s32 vdupq_n_s32 ++ ++poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_p8 vdupq_n_u8 ++ ++poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_p16 vdupq_n_s16 ++ ++float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_f32 vdupq_n_f32 ++ ++int64x1_t 
vmov_n_s64(int64_t value); // VMOV d0,r0,r0 ++#define vmov_n_s64 vdup_n_s64 ++ ++uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 ++#define vmov_n_u64 vdup_n_u64 ++ ++int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 ++#define vmovq_n_s64 vdupq_n_s64 ++ ++uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++#define vmovq_n_u64 vdupq_n_u64 ++ ++//**************Set all lanes to the value of one lane of a vector ************* ++//**************************************************************************** ++//here shuffle is better solution than lane extraction followed by set1 function ++uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) ++{ ++ uint8x8_t res; ++ uint8_t valane; ++ int i = 0; ++ valane = vec.m64_u8[lane]; ++ for (i = 0; i<8; i++) { ++ res.m64_u8[i] = valane; ++ } ++ return res; ++} ++ ++uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16x4_t res; ++ uint16_t valane; ++ valane = vec.m64_u16[lane]; ++ res.m64_u16[0] = valane; ++ res.m64_u16[1] = valane; ++ res.m64_u16[2] = valane; ++ res.m64_u16[3] = valane; ++ return res; ++} ++ ++uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = vec.m64_u32[lane]; ++ res.m64_u32[1] = res.m64_u32[0]; ++ return res; ++} ++ ++int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++#define vdup_lane_s8 vdup_lane_u8 ++ ++int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++#define vdup_lane_s16 vdup_lane_u16 ++ ++int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++#define vdup_lane_s32 vdup_lane_u32 ++ ++poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++#define vdup_lane_p8 vdup_lane_u8 ++ ++poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++#define vdup_lane_p16 vdup_lane_s16 ++ ++float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = vec.m64_f32[lane]; ++ res.m64_f32[1] = res.m64_f32[0]; ++ return res; ++} ++ ++uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0] ++{ ++ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane}; ++ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8); ++} ++ ++uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0] ++{ ++ //we could use 8bit shuffle for 16 bit as well ++ const int8_t lane16 = ((int8_t) lane) << 1; ++ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, ++ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1}; ++ return _mm_shuffle_epi8 
(_pM128i(vec), *(__m128i*)lanemask_e16); ++} ++ ++uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6)) ++ ++int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++#define vdupq_lane_s8 vdupq_lane_u8 ++ ++int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++#define vdupq_lane_s16 vdupq_lane_u16 ++ ++int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_s32 vdupq_lane_u32 ++ ++poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++#define vdupq_lane_p8 vdupq_lane_u8 ++ ++poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++#define vdupq_lane_p16 vdupq_lane_s16 ++ ++float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane)) ++ ++int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++#define vdup_lane_s64(vec,lane) vec ++ ++uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++#define vdup_lane_u64(vec,lane) vec ++ ++int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane) ++{ ++ __m128i vec128; ++ vec128 = _pM128i(vec); ++ return _mm_unpacklo_epi64(vec128,vec128); ++} ++ ++uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++#define vdupq_lane_u64 vdupq_lane_s64 ++ ++// ******************************************************************** ++// ******************** Combining vectors ***************************** ++// ******************************************************************** ++//These intrinsics join two 64 bit vectors into a single 128bit vector. 
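/* Illustrative usage sketch for the vcombine_* intrinsics declared just below (hypothetical helper name; assumes this NEON_2_SSE header is included as arm_neon.h). The two 64-bit halves can be recovered again with the vget_low_*/vget_high_* intrinsics further down. */
static uint8x16_t neon2sse_example_vcombine(uint8x8_t low_half, uint8x8_t high_half)
{
    /* low_half fills bits 0..63 of the 128-bit result, high_half fills bits 64..127 */
    return vcombine_u8(low_half, high_half);
}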
++int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 ++#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 ++#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 ++#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 ++#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 ++//current IA SIMD doesn't support float16 ++ ++float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high) ++{ ++ __m128i res; ++ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); ++ return _M128(res); ++} ++ ++uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 ++#define vcombine_u8 vcombine_s8 ++ ++uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 ++#define vcombine_u16 vcombine_s16 ++ ++uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 ++#define vcombine_u32 vcombine_s32 ++ ++uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 ++#define vcombine_u64 vcombine_s64 ++ ++poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 ++#define vcombine_p8 vcombine_u8 ++ ++poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 ++#define vcombine_p16 vcombine_u16 ++ ++//********************************************************************** ++//************************* Splitting vectors ************************** ++//********************************************************************** ++//**************** Get high part ****************************************** ++//These intrinsics split a 128 bit vector into 2 component 64 bit vectors ++int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a) ++{ ++ int64x1_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a) ++{ ++ __m128i res; ++ __m64_128 res64; ++ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a)); ++ return64(res); ++} ++ ++uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 ++#define vget_high_u8 vget_high_s8 ++ ++uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 ++#define vget_high_u16 vget_high_s16 ++ ++uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 ++#define vget_high_u32 
vget_high_s32 ++ ++uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 ++#define vget_high_u64 vget_high_s64 ++ ++poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 ++#define vget_high_p8 vget_high_u8 ++ ++poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 ++#define vget_high_p16 vget_high_u16 ++ ++//********************** Get low part ********************** ++//********************************************************** ++int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0 ++{ ++ int16x4_t res64; ++ return64(a); ++} ++ ++int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0 ++{ ++ int16x4_t res64; ++ return64(a); ++} ++ ++int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0 ++{ ++ int32x2_t res64; ++ return64(a); ++} ++ ++int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0 ++{ ++ int64x1_t res64; ++ return64 (a); ++} ++ ++float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a) ++{ ++ float32x2_t res64; ++ _M64f(res64, a); ++ return res64; ++} ++ ++uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 ++#define vget_low_u8 vget_low_s8 ++ ++uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 ++#define vget_low_u16 vget_low_s16 ++ ++uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 ++#define vget_low_u32 vget_low_s32 ++ ++uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 ++#define vget_low_u64 vget_low_s64 ++ ++poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 ++#define vget_low_p8 vget_low_u8 ++ ++poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 ++#define vget_low_p16 vget_low_s16 ++ ++//************************************************************************** ++//************************ Converting vectors ********************************** ++//************************************************************************** ++//************* Convert from float *************************************** ++// need to set _MM_SET_ROUNDING_MODE ( x) accordingly ++int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 ++_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only ++ return64(res); ++} ++ ++uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a) ++{ ++ //may be not effective compared with a serial SIMD solution ++ uint32x2_t res64; ++ __m128i res; ++ res = vcvtq_u32_f32(_pM128(a)); ++ return64(res); ++} ++ ++int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 ++#define vcvtq_s32_f32 _mm_cvttps_epi32 ++ ++uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 ++{ ++ //No single instruction SSE solution but we could implement it as following: ++ __m128i resi; ++ __m128 zero, mask, a_pos, mask_f_max_si, res; ++ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; ++ zero = _mm_setzero_ps(); ++ mask = _mm_cmpgt_ps(a, zero); ++ a_pos = _mm_and_ps(a, mask); ++ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff); ++ res = 
_mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything ++ resi = _mm_cvttps_epi32(res); ++ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si); ++} ++ ++// ***** Convert to the fixed point with the number of fraction bits specified by b *********** ++//************************************************************************************************* ++int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 ++_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(vcvtq_n_s32_f32(_pM128(a),b)); ++} ++ ++uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 ++_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res; ++ float convconst; ++ convconst = (float)((uint32_t)1 << b); ++ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst); ++ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst); ++ return res; ++} ++ ++int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 ++_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ __m128 cconst128; ++ __m128i mask, res; ++ convconst = (float)(1 << b); ++ cconst128 = vdupq_n_f32(convconst); ++ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128)); ++ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000 ++} ++ ++uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 ++_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128; ++ convconst = (float)(1 << b); ++ cconst128 = vdupq_n_f32(convconst); ++ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); ++} ++ ++//***************** Convert to float ************************* ++//************************************************************* ++float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 ++_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits ++{ ++ float32x2_t res; ++ res.m64_f32[0] = (float) a.m64_i32[0]; ++ res.m64_f32[1] = (float) a.m64_i32[1]; ++ return res; ++} ++ ++float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 ++_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = (float) a.m64_u32[0]; ++ res.m64_f32[1] = (float) a.m64_u32[1]; ++ return res; ++} ++ ++float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 ++#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) ++ ++float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 ++_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 ++{ ++ //solution may be not optimal ++ __m128 two16, fHi, fLo; ++ __m128i hi, lo; ++ two16 = _mm_set1_ps((float)0x10000); //2^16 ++ // Avoid double rounding by doing two exact conversions ++ // of high and low 16-bit segments ++ hi = _mm_srli_epi32(a, 16); ++ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); ++ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); ++ fLo = _mm_cvtepi32_ps(lo); ++ // do single rounding according to current rounding mode ++ return _mm_add_ps(fHi, fLo); ++} ++ ++// ***** Convert to the float from fixed point with the number of fraction bits specified by b *********** ++float32x2_t vcvt_n_f32_s32(int32x2_t 
a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 ++_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ float32x2_t res; ++ float convconst; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ res.m64_f32[0] = a.m64_i32[0] * convconst; ++ res.m64_f32[1] = a.m64_i32[1] * convconst; ++ return res; ++} ++ ++float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 ++_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32 ++{ ++ float32x2_t res; ++ float convconst; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ res.m64_f32[0] = a.m64_u32[0] * convconst; ++ res.m64_f32[1] = a.m64_u32[1] * convconst; ++ return res; ++} ++ ++float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 ++_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128, af; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ af = _mm_cvtepi32_ps(a); ++ cconst128 = vdupq_n_f32(convconst); ++ return _mm_mul_ps(af,cconst128); ++} ++ ++float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 ++_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128, af; ++ convconst = (float)(1. / (1 << b)); ++ af = vcvtq_f32_u32(a); ++ cconst128 = vdupq_n_f32(convconst); ++ return _mm_mul_ps(af,cconst128); ++} ++ ++//**************Convert between floats *********************** ++//************************************************************ ++float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 ++//Intel SIMD doesn't support 16bits floats curently ++ ++float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 ++//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits ++ ++//************Vector narrow integer conversion (truncation) ****************** ++//**************************************************************************** ++int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 ++_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0 ++{ ++ int8x8_t res64; ++ __m128i res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only ++ return64(res); ++} ++ ++int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 ++_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0 ++{ ++ int16x4_t res64; ++ __m128i res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only ++ return64(res); ++} ++ ++int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 ++_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a) ++{ ++ //may be not effective compared with a serial implementation ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0) ++ return64(res); ++} ++ ++uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 ++#define vmovn_u16 vmovn_s16 ++ ++uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 ++#define vmovn_u32 vmovn_s32 ++ ++uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 ++#define vmovn_u64 vmovn_s64 ++ ++//**************** Vector 
long move *********************** ++//*********************************************************** ++int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 ++#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1 ++ ++int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 ++#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1 ++ ++int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 ++#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1 ++ ++uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 ++#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1 ++ ++uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0 ++#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1 ++ ++uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 ++#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1 ++ ++//*************Vector saturating narrow integer***************** ++//************************************************************** ++int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 ++_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _mm_packs_epi16(a, a); ++ return64(res); ++} ++ ++int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 ++_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = _mm_packs_epi32(a, a); ++ return64(res); ++} ++ ++int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution ++{ ++ int32x2_t res; ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX; ++ if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN; ++ if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX; ++ if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN; ++ res.m64_i32[0] = (int32_t)atmp[0]; ++ res.m64_i32[1] = (int32_t)atmp[1]; ++ return res; ++} ++ 32 bits numbers ++ res_hi = _mm_or_si128(a, mask); ++ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(res_hi); ++} ++//************* Vector saturating narrow integer signed->unsigned ************** ++//***************************************************************************** ++uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 ++_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a) ++{ ++ uint8x8_t res64; ++ __m128i res; ++ res = _mm_packus_epi16(a, a); //use low 64bits only ++ return64(res); ++} ++ ++uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 ++_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a) ++{ ++ uint16x4_t res64; ++ __m128i res; ++ res = _MM_PACKUS1_EPI32(a); //use low 64bits only ++ return64(res); ++} ++ ++uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 ++_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a) ++{ ++ uint32x2_t res64; ++ __m128i res_hi,res_lo, zero, cmp; ++ zero = _mm_setzero_si128(); ++ res_hi = _mm_srli_epi64(a, 32); ++ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero ++ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive ++ res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff ++ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(res_lo); ++} ++ ++// ******************************************************** ++// **************** Table look up ************************** ++// ******************************************************** ++//VTBL (Vector Table Lookup) uses byte indexes in a
control vector to look up byte values ++//in a table and generate a new vector. Indexes out of range return 0. ++//for Intel SIMD we need to set the MSB to 1 for zero return ++uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ __m128i c7, maskgt, bmask, b128; ++ c7 = _mm_set1_epi8 (7); ++ b128 = _pM128i(b); ++ maskgt = _mm_cmpgt_epi8(b128,c7); ++ bmask = _mm_or_si128(b128,maskgt); ++ bmask = _mm_shuffle_epi8(_pM128i(a),bmask); ++ return64(bmask); ++} ++ ++int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 ++#define vtbl1_s8 vtbl1_u8 ++ ++poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++#define vtbl1_p8 vtbl1_u8 ++ ++//Special trick to avoid __declspec(align('8')) won't be aligned" error ++//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ __m128i c15, a01, maskgt15, bmask, b128; ++ c15 = _mm_set1_epi8 (15); ++ b128 = _pM128i(b); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ bmask = _mm_or_si128(b128, maskgt15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1])); ++ a01 = _mm_shuffle_epi8(a01, bmask); ++ return64(a01); ++} ++#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b) ++ ++//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++#define vtbl2_s8 vtbl2_u8 ++ ++//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++#define vtbl2_p8 vtbl2_u8 ++ ++//Special trick to avoid __declspec(align('16')) won't be aligned" error ++//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128; ++ c15 = _mm_set1_epi8 (15); ++ c23 = _mm_set1_epi8 (23); ++ b128 = _pM128i(b); ++ maskgt23 = _mm_cmpgt_epi8(b128,c23); ++ bmask = _mm_or_si128(b128, maskgt23); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); ++ sh0 = _mm_shuffle_epi8(a01, bmask); ++ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1 ++ return64(sh0); ++} ++#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b) ++ ++//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++#define vtbl3_s8 vtbl3_u8 ++ ++//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++#define vtbl3_p8 vtbl3_u8 ++ ++//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128; ++ c15 = _mm_set1_epi8 (15); ++ c31 = _mm_set1_epi8 (31); ++ b128 = _pM128i(b); ++ maskgt31 = _mm_cmpgt_epi8(b128,c31); ++ bmask = _mm_or_si128(b128, maskgt31); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); ++ a23 = 
_mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3])); ++ sh0 = _mm_shuffle_epi8(a01, bmask); ++ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1 ++ return64(sh0); ++} ++#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b) ++ ++//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++#define vtbl4_s8 vtbl4_u8 ++ ++//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++#define vtbl4_p8 vtbl4_u8 ++ ++//****************** Extended table look up intrinsics *************************** ++//********************************************************************************** ++//VTBX (Vector Table Extension) works in the same way as VTBL do, ++// except that indexes out of range leave the destination element unchanged. ++ ++uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ uint8x8_t res64; ++ __m128i c7, maskgt, sh, c128; ++ c7 = _mm_set1_epi8 (7); ++ c128 = _pM128i(c); ++ maskgt = _mm_cmpgt_epi8(c128,c7); ++ c7 = _mm_and_si128(maskgt,_pM128i(a)); ++ sh = _mm_shuffle_epi8(_pM128i(b),c128); ++ sh = _mm_andnot_si128(maskgt,sh); ++ sh = _mm_or_si128(sh,c7); ++ return64(sh); ++} ++ ++int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 ++#define vtbx1_s8 vtbx1_u8 ++ ++poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++#define vtbx1_p8 vtbx1_u8 ++ ++//Special trick to avoid __declspec(align('8')) won't be aligned" error ++//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c) ++{ ++ uint8x8_t res64; ++ __m128i c15, b01, maskgt15, sh, c128; ++ c15 = _mm_set1_epi8 (15); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128, c15); ++ c15 = _mm_and_si128(maskgt15, _pM128i(a)); ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1])); ++ sh = _mm_shuffle_epi8(b01, c128); ++ sh = _mm_andnot_si128(maskgt15, sh); ++ sh = _mm_or_si128(sh,c15); ++ return64(sh); ++} ++#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++#define vtbx2_s8 vtbx2_u8 ++ ++//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++#define vtbx2_p8 vtbx2_u8 ++ ++//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128; ++ c15 = _mm_set1_epi8 (15); ++ c23 = _mm_set1_epi8 (23); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128,c15); ++ maskgt23 = _mm_cmpgt_epi8(c128,c23); ++ c23 = _mm_and_si128(maskgt23, _pM128i(a)); ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); ++ sh0 = _mm_shuffle_epi8(b01, c128); ++ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); ++ sh0 = 
_mm_andnot_si128(maskgt23,sh0); ++ sh0 = _mm_or_si128(sh0,c23); ++ return64(sh0); ++} ++#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c); ++#define vtbx3_s8 vtbx3_u8 ++ ++//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c); ++#define vtbx3_p8 vtbx3_u8 ++ ++//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128; ++ c15 = _mm_set1_epi8 (15); ++ c31 = _mm_set1_epi8 (31); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128,c15); ++ maskgt31 = _mm_cmpgt_epi8(c128,c31); ++ c31 = _mm_and_si128(maskgt31, _pM128i(a)); ++ ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); ++ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3])); ++ sh0 = _mm_shuffle_epi8(b01, c128); ++ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); ++ sh0 = _mm_andnot_si128(maskgt31,sh0); ++ sh0 = _mm_or_si128(sh0,c31); ++ return64(sh0); ++} ++#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c); ++#define vtbx4_s8 vtbx4_u8 ++ ++//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c); ++#define vtbx4_p8 vtbx4_u8 ++ ++//************************************************************************************************* ++// *************************** Operations with a scalar value ********************************* ++//************************************************************************************************* ++ ++//******* Vector multiply accumulate by scalar ************************************************* ++//********************************************************************************************** ++int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0] ++{ ++ int16_t c; ++ int16x4_t scalar; ++ c = vget_lane_s16(v, l); ++ scalar = vdup_n_s16(c); ++ return vmla_s16(a, b, scalar); ++} ++ ++int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0] ++{ ++ int32_t c; ++ int32x2_t scalar; ++ c = vget_lane_s32(v, l); ++ scalar = vdup_n_s32(c); ++ return vmla_s32(a, b, scalar); ++} ++ ++uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] ++#define vmla_lane_u16 vmla_lane_s16 ++ ++ ++uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] ++#define vmla_lane_u32 vmla_lane_s32 ++ ++float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, 
__constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) ++{ ++ float32_t vlane; ++ float32x2_t c; ++ vlane = vget_lane_f32(v, l); ++ c = vdup_n_f32(vlane); ++ return vmla_f32(a,b,c); ++} ++ ++int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] ++{ ++ int16_t vlane; ++ int16x8_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdupq_n_s16(vlane); ++ return vmlaq_s16(a,b,c); ++} ++ ++int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] ++{ ++ int32_t vlane; ++ int32x4_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdupq_n_s32(vlane); ++ return vmlaq_s32(a,b,c); ++} ++ ++uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++#define vmlaq_lane_u16 vmlaq_lane_s16 ++ ++uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++#define vmlaq_lane_u32 vmlaq_lane_s32 ++ ++float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32_t vlane; ++ float32x4_t c; ++ vlane = vget_lane_f32(v, l); ++ c = vdupq_n_f32(vlane); ++ return vmlaq_f32(a,b,c); ++} ++ ++//***************** Vector widening multiply accumulate by scalar ********************** ++//*************************************************************************************** ++int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlal_s16(a, b, c); ++} ++ ++int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmlal_s32(a, b, c); ++} ++ ++uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_u16(v, l); ++ c = vdup_n_u16(vlane); ++ return vmlal_u16(a, b, c); ++} ++ ++uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmlal_u32(a, b, c); ++} ++ 
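++/* Editor's note -- illustrative only, not taken from the upstream NEON-to-SSE header:
++   all of the *_lane_* multiply-accumulate wrappers above follow one pattern -- read the
++   selected lane with vget_lane, broadcast it with vdup, then reuse the plain vector
++   intrinsic. A scalar reference model of vmlal_lane_s16 (hypothetical helper name,
++   plain C arrays assumed) that can be used to sanity-check the emulation:
++
++   static void ref_vmlal_lane_s16(int32_t acc[4], const int16_t b[4],
++                                  const int16_t v[4], int lane)
++   {
++       int i;
++       for (i = 0; i < 4; i++)
++           acc[i] += (int32_t)b[i] * (int32_t)v[lane];  // widen each 16-bit product to 32 bits, then accumulate, as VMLAL.S16 does
++   }
++*/
++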
++// ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* ++// ************************************************************************************************ ++int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vqdmlal_s16(a, b, c); ++} ++ ++int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) ++{ ++ int32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vqdmlal_s32(a, b, c); ++} ++ ++// ****** Vector multiply subtract by scalar ***************** ++// ************************************************************* ++int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmls_s16(a, b, c); ++} ++ ++int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmls_s32(a, b, c); ++} ++ ++uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmls_s16(a, b, c); ++} ++ ++uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmls_u32(a, b, c); ++} ++ ++float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) ++{ ++ float32_t vlane; ++ float32x2_t c; ++ vlane = (float) vget_lane_f32(v, l); ++ c = vdup_n_f32(vlane); ++ return vmls_f32(a,b,c); ++} ++ ++int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0] ++{ ++ int16_t vlane; ++ int16x8_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdupq_n_s16(vlane); ++ return vmlsq_s16(a, b,c); ++} ++ ++int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t 
vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0] ++{ ++ int32_t vlane; ++ int32x4_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdupq_n_s32(vlane); ++ return vmlsq_s32(a,b,c); ++} ++ ++uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x8_t c; ++ vlane = vget_lane_u16(v, l); ++ c = vdupq_n_u16(vlane); ++ return vmlsq_u16(a,b,c); ++} ++ ++uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x4_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdupq_n_u32(vlane); ++ return vmlsq_u32(a,b,c); ++} ++ ++float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32_t vlane; ++ float32x4_t c; ++ vlane = (float) vget_lane_f32(v, l); ++ c = vdupq_n_f32(vlane); ++ return vmlsq_f32(a,b,c); ++} ++ ++// **** Vector widening multiply subtract by scalar **** ++// **************************************************** ++int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlsl_s16(a, b, c); ++} ++ ++int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmlsl_s32(a, b, c); ++} ++ ++uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlsl_s16(a, b, c); ++} ++ ++uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmlsl_u32(a, b, c); ++} ++ ++//********* Vector widening saturating doubling multiply subtract by scalar ************************** ++//****************************************************************************************************** ++int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, 
__constrange(0,3) int l) ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vqdmlsl_s16(a, b, c); ++} ++ ++int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vqdmlsl_s32(a, b, c); ++} ++//********** Vector multiply with scalar ***************************** ++int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0] ++{ ++ int16x4_t b16x4; ++ b16x4 = vdup_n_s16(b); ++ return vmul_s16(a, b16x4); ++} ++ ++int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] ++_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0] ++{ ++ //serial solution looks faster ++ int32x2_t b32x2; ++ b32x2 = vdup_n_s32(b); ++ return vmul_s32(a, b32x2); ++} ++ ++float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] ++_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0] ++{ ++ float32x2_t b32x2; ++ b32x2 = vdup_n_f32(b); ++ return vmul_f32(a, b32x2); ++} ++ ++uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] ++_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0] ++{ ++ uint16x4_t b16x4; ++ b16x4 = vdup_n_s16(b); ++ return vmul_s16(a, b16x4); ++} ++ ++uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] ++_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0] ++{ ++ //serial solution looks faster ++ uint32x2_t b32x2; ++ b32x2 = vdup_n_u32(b); ++ return vmul_u32(a, b32x2); ++} ++ ++int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] ++{ ++ int16x8_t b16x8; ++ b16x8 = vdupq_n_s16(b); ++ return vmulq_s16(a, b16x8); ++} ++ ++int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] ++_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] ++{ ++ int32x4_t b32x4; ++ b32x4 = vdupq_n_s32(b); ++ return vmulq_s32(a, b32x4); ++} ++ ++float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] ++_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] ++{ ++ float32x4_t b32x4; ++ b32x4 = vdupq_n_f32(b); ++ return vmulq_f32(a, b32x4); ++} ++ ++uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] ++_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] ++{ ++ uint16x8_t b16x8; ++ b16x8 = vdupq_n_s16(b); ++ return vmulq_s16(a, b16x8); ++} ++ ++uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] ++{ ++ uint32x4_t b32x4; ++ b32x4 = vdupq_n_u32(b); ++ return vmulq_u32(a, b32x4); ++} ++ ++//********** Vector multiply lane ***************************** ++int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); ++_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c) ++{ ++ int16x4_t b16x4; ++ int16_t vlane; 
++ vlane = vget_lane_s16(b, c); ++ b16x4 = vdup_n_s16(vlane); ++ return vmul_s16(a, b16x4); ++} ++ ++int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c) ++{ ++ int32x2_t b32x2; ++ int32_t vlane; ++ vlane = vget_lane_s32(b, c); ++ b32x2 = vdup_n_s32(vlane); ++ return vmul_s32(a, b32x2); ++} ++ ++float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c) ++{ ++ float32x2_t b32x2; ++ float32_t vlane; ++ vlane = vget_lane_f32(b, c); ++ b32x2 = vdup_n_f32(vlane); ++ return vmul_f32(a, b32x2); ++} ++ ++uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); ++#define vmul_lane_u16 vmul_lane_s16 ++ ++uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); ++#define vmul_lane_u32 vmul_lane_s32 ++ ++int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c); ++_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c) ++{ ++ int16x8_t b16x8; ++ int16_t vlane; ++ vlane = vget_lane_s16(b, c); ++ b16x8 = vdupq_n_s16(vlane); ++ return vmulq_s16(a, b16x8); ++} ++ ++int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c) ++{ ++ int32x4_t b32x4; ++ int32_t vlane; ++ vlane = vget_lane_s32(b, c); ++ b32x4 = vdupq_n_s32(vlane); ++ return vmulq_s32(a, b32x4); ++} ++ ++float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c) ++{ ++ float32x4_t b32x4; ++ float32_t vlane; ++ vlane = vget_lane_f32(b, c); ++ b32x4 = vdupq_n_f32(vlane); ++ return vmulq_f32(a, b32x4); ++} ++ ++uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); ++#define vmulq_lane_u16 vmulq_lane_s16 ++ ++uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); ++#define vmulq_lane_u32 vmulq_lane_s32 ++ ++//**** Vector long multiply with scalar ************ ++int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0] ++{ ++ int16x4_t b16x4; ++ b16x4 = vdup_n_s16(val2); ++ return vmull_s16(vec1, b16x4); ++} ++ ++int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0] ++{ ++ int32x2_t b32x2; ++ b32x2 = vdup_n_s32(val2); ++ return vmull_s32(vec1, b32x2); ++} ++ ++uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0] ++{ ++ uint16x4_t b16x4; ++ b16x4 = vdup_n_s16(val2); ++ return vmull_s16(vec1, b16x4); ++} ++ ++uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] ++_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0] ++{ ++ uint32x2_t b32x2; ++ b32x2 = vdup_n_u32(val2); ++ return vmull_u32(vec1, b32x2); ++} ++ ++//**** Vector long multiply by scalar **** ++int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t 
vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0] ++{ ++ int16_t vlane; ++ int16x4_t b; ++ vlane = vget_lane_s16(val2, val3); ++ b = vdup_n_s16(vlane); ++ return vmull_s16(vec1, b); ++} ++ ++int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0] ++{ ++ int32_t vlane; ++ int32x2_t b; ++ vlane = vget_lane_s32(val2, val3); ++ b = vdup_n_s32(vlane); ++ return vmull_s32(vec1, b); ++} ++ ++uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t b; ++ vlane = vget_lane_s16(val2, val3); ++ b = vdup_n_s16(vlane); ++ return vmull_s16(vec1, b); ++} ++ ++uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] ++_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t b; ++ vlane = vget_lane_u32(val2, val3); ++ b = vdup_n_u32(vlane); ++ return vmull_u32(vec1, b); ++} ++ ++//********* Vector saturating doubling long multiply with scalar ******************* ++int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2) ++{ ++ //the serial soulution may be faster due to saturation ++ int16x4_t b; ++ b = vdup_n_s16(val2); ++ return vqdmull_s16(vec1, b); ++} ++ ++int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t b; ++ b = vdup_n_s32(val2); ++ return vqdmull_s32(vec1,b); //slow serial function!!!! ++} ++ ++//************* Vector saturating doubling long multiply by scalar *********************************************** ++int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) ++{ ++ int16_t c; ++ int16x4_t scalar; ++ c = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(c); ++ return vqdmull_s16(vec1, scalar); ++} ++ ++ ++int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32_t c; ++ int32x2_t scalar; ++ c = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(c); ++ return vqdmull_s32(vec1,scalar); //slow serial function!!!! 
++} ++ ++// *****Vector saturating doubling multiply high with scalar ***** ++int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2) ++{ ++ int16x4_t res64; ++ return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); ++} ++ ++int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2) ++{ ++ int32x2_t res64; ++ return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); ++} ++ ++int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(val2); ++ return vqdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(val2); ++ return vqdmulhq_s32(vec1, scalar); ++} ++ ++//***** Vector saturating doubling multiply high by scalar **************** ++int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x4_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(vlane); ++ return vqdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32_t vlane; ++ int32x2_t scalar; ++ vlane = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(vlane); ++ return vqdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x8_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdupq_n_s16(vlane ); ++ return vqdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //solution may be not optimal ++ int32_t vlane; ++ int32x4_t scalar; ++ vlane = vgetq_lane_s32(_pM128i(val2), val3); ++ scalar = vdupq_n_s32(vlane ); ++ return vqdmulhq_s32(vec1, scalar); ++} ++ ++//******** Vector saturating rounding doubling multiply high with scalar *** ++int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16x4_t scalar; ++ scalar = vdup_n_s16(val2); ++ return vqrdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqrdmulh_n_s32(int32x2_t vec1, 
int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x2_t scalar; ++ scalar = vdup_n_s32(val2); ++ return vqrdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(val2); ++ return vqrdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(val2); ++ return vqrdmulhq_s32(vec1, scalar); ++} ++ ++//********* Vector rounding saturating doubling multiply high by scalar **** ++int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x4_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(vlane); ++ return vqrdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32_t vlane; ++ int32x2_t scalar; ++ vlane = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(vlane); ++ return vqrdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x8_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdupq_n_s16(vlane); ++ return vqrdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //solution may be not optimal ++ int32_t vlane; ++ int32x4_t scalar; ++ vlane = vgetq_lane_s32(_pM128i(val2), val3); ++ scalar = vdupq_n_s32(vlane ); ++ return vqrdmulhq_s32(vec1, scalar); ++} ++ ++//**************Vector multiply accumulate with scalar ******************* ++int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0] ++{ ++ int16x4_t scalar; ++ scalar = vdup_n_s16(c); ++ return vmla_s16(a, b, scalar); ++} ++ ++int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0] ++{ ++ int32x2_t scalar; ++ scalar = vdup_n_s32(c); ++ return vmla_s32(a, b, scalar); ++} ++ ++uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t 
b, uint16_t c); // VMLA.I16 d0, d0, d0[0] ++#define vmla_n_u16 vmla_n_s16 ++ ++ ++uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] ++#define vmla_n_u32 vmla_n_s32 ++ ++ ++float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0] ++{ ++ float32x2_t scalar; ++ scalar = vdup_n_f32(c); ++ return vmla_f32(a, b, scalar); ++} ++ ++int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] ++{ ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(c); ++ return vmlaq_s16(a,b,scalar); ++} ++ ++int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(c); ++ return vmlaq_s32(a,b,scalar); ++} ++ ++uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] ++#define vmlaq_n_u16 vmlaq_n_s16 ++ ++uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] ++#define vmlaq_n_u32 vmlaq_n_s32 ++ ++float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32x4_t scalar; ++ scalar = vdupq_n_f32(c); ++ return vmlaq_f32(a,b,scalar); ++} ++ ++//************Vector widening multiply accumulate with scalar**************************** ++int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlal_s16(a, b, vc); ++} ++ ++int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmlal_s32(a, b, vc); ++} ++ ++uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlal_s16(a, b, vc); ++} ++ ++uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmlal_u32(a, b, vc); ++} ++ ++//************ Vector widening saturating doubling multiply accumulate with scalar ************** ++int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) ++{ ++ //not optimal SIMD soulution, serial may be faster ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vqdmlal_s16(a, b, vc); ++} ++ ++int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return 
vqdmlal_s32(a, b, vc); ++} ++ ++//******** Vector multiply subtract with scalar ************** ++int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmls_s16(a, b, vc); ++} ++ ++int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmls_s32(a, b, vc); ++} ++ ++uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmls_s16(a, b, vc); ++} ++ ++uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmls_u32(a, b, vc); ++} ++ ++float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c; ++ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c; ++ return res; ++} ++ ++int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] ++{ ++ int16x8_t vc; ++ vc = vdupq_n_s16(c); ++ return vmlsq_s16(a, b,vc); ++} ++ ++int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0] ++{ ++ int32x4_t vc; ++ vc = vdupq_n_s32(c); ++ return vmlsq_s32(a,b,vc); ++} ++ ++uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0] ++{ ++ uint16x8_t vc; ++ vc = vdupq_n_u16(c); ++ return vmlsq_u16(a,b,vc); ++} ++ ++uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0] ++{ ++ uint32x4_t vc; ++ vc = vdupq_n_u32(c); ++ return vmlsq_u32(a,b,vc); ++} ++ ++float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) ++{ ++ float32x4_t vc; ++ vc = vdupq_n_f32(c); ++ return vmlsq_f32(a,b,vc); ++} ++ ++//**** Vector widening multiply subtract with scalar ****** ++int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlsl_s16(a, b, vc); ++} ++ ++int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmlsl_s32(a, b, vc); ++} ++ ++uint32x4_t vmlsl_n_u16(uint32x4_t 
a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_u16(c); ++ return vmlsl_u16(a, b, vc); ++} ++ ++uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmlsl_u32(a, b, vc); ++} ++ ++//***** Vector widening saturating doubling multiply subtract with scalar ********* ++//********************************************************************************** ++int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vqdmlsl_s16(a, b, vc); ++} ++ ++int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vqdmlsl_s32(a, b, vc); ++} ++ ++//******************* Vector extract *********************************************** ++//************************************************************************************* ++//VEXT (Vector Extract) extracts elements from the bottom end of the second operand ++//vector and the top end of the first, concatenates them, and places the result in the destination vector ++//c elements from the bottom end of the second operand and (8-c) from the top end of the first ++int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int8x8_t res; ++ int i; ++ for (i = 0; i<8 - c; i++) { ++ res.m64_i8[i] = a.m64_i8[i + c]; ++ } ++ for(i = 0; i> 1); ++ tmp = _mm_srli_epi32(res, 2); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); ++ tmp = _mm_srli_epi32(res, 4); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); ++ tmp = _mm_srli_epi32(res, 8); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); ++ tmp = _mm_srli_epi32(res, 16); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); ++ ++ tmp = _mm_srli_epi32(res, 1); ++ tmp = _mm_and_si128(tmp, c55555555); ++ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); ++ ++ tmp = _mm_srli_epi32(res, 2); ++ tmp = _mm_and_si128(tmp, c33333333); ++ tmp1 = _mm_and_si128(res, c33333333); ++ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); ++ ++ tmp = _mm_srli_epi32(res, 4); ++ tmp = _mm_add_epi32(tmp, res); ++ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f); ++ ++ tmp = _mm_srli_epi32(res, 8); ++ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); ++ ++ tmp = _mm_srli_epi32(res, 16); ++ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); ++ ++ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; ++ ++ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; ++} ++ ++uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 ++#define vclzq_u8 vclzq_s8 ++ ++uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 ++#define vclzq_u16 vclzq_s16 ++ 
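++/* Editor's note -- illustrative only, not from the upstream NEON-to-SSE header:
++   a plain-C reference model of the 32-bit leading-zero count that the bit-smearing
++   SSE code above emulates and that the vclzq_u32 alias just below forwards to
++   (via vclzq_s32). Hypothetical helper name, 4-element arrays assumed; an input
++   of 0 yields 32, matching the NEON VCLZ behaviour.
++
++   static void ref_vclzq_u32(uint32_t r[4], const uint32_t a[4])
++   {
++       int i, n;
++       for (i = 0; i < 4; i++) {
++           for (n = 0; n < 32 && !(a[i] & (0x80000000u >> n)); n++)
++               ;                              // count zero bits from the MSB down
++           r[i] = (uint32_t)n;
++       }
++   }
++*/
++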
++uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 ++#define vclzq_u32 vclzq_s32 ++ ++//************** Count leading sign bits ************************** ++//******************************************************************** ++//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following ++// the topmost bit, that are the same as the topmost bit, in each element in a vector ++//No corresponding vector intrinsics in IA32, need to implement it. ++//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits ++int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 ++_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vclsq_s8(_pM128i(a)); ++ return64(res); ++} ++ ++int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 ++_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = vclsq_s16(_pM128i(a)); ++ return64(res); ++} ++ ++int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 ++_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = vclsq_s32(_pM128i(a)); ++ return64(res); ++} ++ ++int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 ++_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) ++{ ++ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; ++ cff = _mm_cmpeq_epi8 (a,a); //0xff ++ c80 = _mm_set1_epi8(0x80); ++ c1 = _mm_set1_epi8(1); ++ a_mask = _mm_and_si128(a, c80); ++ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s8(a_comb); ++ return _mm_sub_epi8(a_comb, c1); ++} ++ ++int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 ++_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) ++{ ++ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; ++ cffff = _mm_cmpeq_epi16(a,a); ++ c8000 = _mm_slli_epi16(cffff, 15); //0x8000 ++ c1 = _mm_srli_epi16(cffff,15); //0x1 ++ a_mask = _mm_and_si128(a, c8000); ++ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cffff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s16(a_comb); ++ return _mm_sub_epi16(a_comb, c1); ++} ++ ++int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 ++_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) ++{ ++ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; ++ cffffffff = _mm_cmpeq_epi32(a,a); ++ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 ++ c1 = _mm_srli_epi32(cffffffff,31); //0x1 ++ a_mask = _mm_and_si128(a, c80000000); ++ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cffffffff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s32(a_comb); ++ return _mm_sub_epi32(a_comb, c1); ++} ++ ++//************************* Count number of set bits ******************************** ++//************************************************************************************* ++//No corresponding SIMD solution. 
One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element ++//another option is to do the following algorithm: ++ ++uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 ++_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a) ++{ ++ uint8x8_t res64; ++ __m128i res; ++ res = vcntq_u8(_pM128i(a)); ++ return64(res); ++} ++ ++int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 ++#define vcnt_s8 vcnt_u8 ++ ++poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 ++#define vcnt_p8 vcnt_u8 ++ ++uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 ++_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) ++{ ++ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, ++ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, ++ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, ++ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 }; ++ __m128i maskLOW, mask, lowpopcnt, hipopcnt; ++ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set ++ mask = _mm_and_si128(a, maskLOW); ++ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway ++ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits ++ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set ++ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway ++ return _mm_add_epi8(lowpopcnt, hipopcnt); ++} ++ ++int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 ++#define vcntq_s8 vcntq_u8 ++ ++poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 ++#define vcntq_p8 vcntq_u8 ++ ++//************************************************************************************** ++//*********************** Logical operations **************************************** ++//************************************************************************************** ++//************************** Bitwise not *********************************** ++//several Bitwise not implementations possible for SIMD. 
Eg "xor" with all ones, but the following one gives good performance ++int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vmvnq_s8(_pM128i(a)); ++ return64(res); ++} ++ ++int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = vmvnq_s16(_pM128i(a)); ++ return64(res); ++} ++ ++int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = vmvnq_s32(_pM128i(a)); ++ return64(res); ++} ++ ++uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 ++#define vmvn_u8 vmvn_s8 ++ ++uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 ++#define vmvn_u16 vmvn_s16 ++ ++uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 ++#define vmvn_u32 vmvn_s32 ++ ++poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 ++#define vmvn_p8 vmvn_u8 ++ ++int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi8 (a,a); //0xff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi16 (a,a); //0xffff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 ++#define vmvnq_u8 vmvnq_s8 ++ ++uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 ++#define vmvnq_u16 vmvnq_s16 ++ ++uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 ++#define vmvnq_u32 vmvnq_s32 ++ ++poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 ++#define vmvnq_p8 vmvnq_u8 ++ ++//****************** Bitwise and *********************** ++//****************************************************** ++int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0]; ++ return res; ++} ++ ++uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 ++#define vand_u8 vand_s8 ++ ++uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 ++#define vand_u16 vand_s16 ++ ++uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 ++#define vand_u32 vand_s32 ++ ++uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 ++#define vand_u64 vand_s64 ++ ++ ++int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 ++#define vandq_s8 _mm_and_si128 ++ ++int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 ++#define vandq_s16 _mm_and_si128 ++ ++int32x4_t vandq_s32(int32x4_t 
a, int32x4_t b); // VAND q0,q0,q0 ++#define vandq_s32 _mm_and_si128 ++ ++int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 ++#define vandq_s64 _mm_and_si128 ++ ++uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 ++#define vandq_u8 _mm_and_si128 ++ ++uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 ++#define vandq_u16 _mm_and_si128 ++ ++uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 ++#define vandq_u32 _mm_and_si128 ++ ++uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 ++#define vandq_u64 _mm_and_si128 ++ ++//******************** Bitwise or ********************************* ++//****************************************************************** ++int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0]; ++ return res; ++} ++ ++uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 ++#define vorr_u8 vorr_s8 ++ ++uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 ++#define vorr_u16 vorr_s16 ++ ++uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 ++#define vorr_u32 vorr_s32 ++ ++uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 ++#define vorr_u64 vorr_s64 ++ ++int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 ++#define vorrq_s8 _mm_or_si128 ++ ++int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 ++#define vorrq_s16 _mm_or_si128 ++ ++int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 ++#define vorrq_s32 _mm_or_si128 ++ ++int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 ++#define vorrq_s64 _mm_or_si128 ++ ++uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 ++#define vorrq_u8 _mm_or_si128 ++ ++uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 ++#define vorrq_u16 _mm_or_si128 ++ ++uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 ++#define vorrq_u32 _mm_or_si128 ++ ++uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 ++#define vorrq_u64 _mm_or_si128 ++ ++//************* Bitwise exclusive or (EOR or XOR) ****************** ++//******************************************************************* ++int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 ++#define veor_s16 veor_s8 ++ ++int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 ++#define veor_s32 veor_s8 ++ ++int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0]; ++ return res; 
++} ++ ++uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 ++#define veor_u8 veor_s8 ++ ++uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 ++#define veor_u16 veor_s16 ++ ++uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 ++#define veor_u32 veor_s32 ++ ++uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 ++#define veor_u64 veor_s64 ++ ++int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 ++#define veorq_s8 _mm_xor_si128 ++ ++int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 ++#define veorq_s16 _mm_xor_si128 ++ ++int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 ++#define veorq_s32 _mm_xor_si128 ++ ++int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 ++#define veorq_s64 _mm_xor_si128 ++ ++uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 ++#define veorq_u8 _mm_xor_si128 ++ ++uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 ++#define veorq_u16 _mm_xor_si128 ++ ++uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 ++#define veorq_u32 _mm_xor_si128 ++ ++uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 ++#define veorq_u64 _mm_xor_si128 ++ ++//********************** Bit Clear ********************************** ++//******************************************************************* ++//Logical AND complement (AND negation or AND NOT) ++int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" ++} ++ ++int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 ++#define vbic_s16 vbic_s8 ++ ++int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 ++#define vbic_s32 vbic_s8 ++ ++int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 ++#define vbic_u8 vbic_s8 ++ ++uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 ++#define vbic_u16 vbic_s16 ++ ++uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 ++#define vbic_u32 vbic_s32 ++ ++uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 ++#define vbic_u64 vbic_s64 ++ ++int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 ++#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 ++#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 ++#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 ++#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 ++#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 ++#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 ++#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 ++#define 
vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++//**************** Bitwise OR complement ******************************** ++//**************************************** ******************************** ++//no exact IA 32 match, need to implement it as following ++int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vornq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vornq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vornq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 ++#define vorn_u8 vorn_s8 ++ ++ ++uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 ++#define vorn_u16 vorn_s16 ++ ++uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 ++#define vorn_u32 vorn_s32 ++ ++uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 ++#define vorn_u64 vorn_s64 ++ ++ ++int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s8( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s16( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s32( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b) ++{ ++ __m128i c1, b1; ++ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff ++ b1 = _mm_andnot_si128 (b, c1); ++ return _mm_or_si128 (a, b1); ++} ++ ++uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_u8( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s16( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_u32( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 ++#define vornq_u64 vornq_s64 ++ ++//********************* Bitwise Select ***************************** 
++//****************************************************************** ++//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) ++ ++//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the ++//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. ++ ++//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination ++//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged ++ ++//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination ++//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. ++ ++//VBSL only is implemented for SIMD ++int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); ++ return64(res); ++} ++ ++int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_s16 vbsl_s8 ++ ++int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 ++#define vbsl_s32 vbsl_s8 ++ ++int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 ++#define vbsl_u8 vbsl_s8 ++ ++uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_u16 vbsl_s8 ++ ++uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 ++#define vbsl_u32 vbsl_s8 ++ ++uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 ++#define vbsl_u64 vbsl_s64 ++ ++float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) ++{ ++ __m128 sel1, sel2; ++ __m64_128 res64; ++ sel1 = _mm_and_ps (_pM128(a), _pM128(b)); ++ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c)); ++ sel1 = _mm_or_ps (sel1, sel2); ++ _M64f(res64, sel1); ++ return res64; ++} ++ ++poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 ++#define vbsl_p8 vbsl_s8 ++ ++poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_p16 vbsl_s8 ++ ++int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 ++{ ++ __m128i sel1, sel2; ++ sel1 = _mm_and_si128 (a, b); ++ sel2 = _mm_andnot_si128 (a, c); ++ return _mm_or_si128 (sel1, sel2); ++} ++ ++int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_s16 vbslq_s8 ++ ++int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 ++#define vbslq_s32 vbslq_s8 ++ ++int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 ++#define vbslq_s64 vbslq_s8 ++ ++uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 ++#define vbslq_u8 vbslq_s8 ++ ++uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_u16 vbslq_s8 ++ ++uint32x4_t 
vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 ++#define vbslq_u32 vbslq_s8 ++ ++uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 ++#define vbslq_u64 vbslq_s8 ++ ++float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 ++{ ++ __m128 sel1, sel2; ++ sel1 = _mm_and_ps (*(__m128*)&a, b); ++ sel2 = _mm_andnot_ps (*(__m128*)&a, c); ++ return _mm_or_ps (sel1, sel2); ++} ++ ++poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 ++#define vbslq_p8 vbslq_u8 ++ ++poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_p16 vbslq_s8 ++ ++//************************************************************************************ ++//**************** Transposition operations **************************************** ++//************************************************************************************ ++//***************** Vector Transpose ************************************************ ++//************************************************************************************ ++//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. ++// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) ++int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp ++ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7) ++ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6), ++ return val; ++} ++ ++int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15}; ++ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3 ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2), ++ return val; ++} ++ ++int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 ++_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 ++ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0, ++ return val; ++} ++ ++uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 ++#define vtrn_u8 vtrn_s8 ++ ++uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 ++#define vtrn_u16 vtrn_s16 ++ ++uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 ++#define vtrn_u32 vtrn_s32 ++ ++float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 ++_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x2x2_t val; ++ 
val.val[0].m64_f32[0] = a.m64_f32[0]; ++ val.val[0].m64_f32[1] = b.m64_f32[0]; ++ val.val[1].m64_f32[0] = a.m64_f32[1]; ++ val.val[1].m64_f32[1] = b.m64_f32[1]; ++ return val; //a0,b0,a1,b1 ++} ++ ++poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 ++#define vtrn_p8 vtrn_u8 ++ ++poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 ++#define vtrn_p16 vtrn_s16 ++ ++//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 ++{ ++ int8x16x2_t r8x16; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 ++ ++ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) ++ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) ++ return r8x16; ++} ++ ++int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 ++{ ++ int16x8x2_t v16x8; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 ++ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 ++ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 ++ return v16x8; ++} ++ ++int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ int32x4x2_t v32x4; ++ __m128i a_sh, b_sh; ++ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 ++ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 ++ ++ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 ++ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 ++ return v32x4; ++} ++ ++uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 ++#define vtrnq_u8 vtrnq_s8 ++ ++uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 ++#define vtrnq_u16 vtrnq_s16 ++ ++uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 ++#define vtrnq_u32 vtrnq_s32 ++ ++float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ float32x4x2_t f32x4; ++ __m128 a_sh, b_sh; ++ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness ++ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness ++ ++ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 ++ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 ++ return f32x4; ++} ++ ++poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 ++#define vtrnq_p8 vtrnq_s8 ++ ++poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 ++#define vtrnq_p16 vtrnq_s16 ++ 
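
For illustration only (not part of the header or of this patch): the bitwise-select wrappers above all reduce to the same SSE pattern, (mask & b) | (~mask & c), built from _mm_and_si128, _mm_andnot_si128 and _mm_or_si128, and the transpose wrappers likewise come down to shuffling even/odd lanes apart and re-unpacking. The standalone sketch below checks that select pattern against a scalar reference; the helper name select_bits and the test values are invented for the example, and any SSE2-capable x86 compiler should build it.

/* Minimal sketch of the (mask & b) | (~mask & c) select used by vbsl/vbslq above.
   Not part of the header; select_bits and the test values are illustrative only. */
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

static __m128i select_bits(__m128i mask, __m128i b, __m128i c)
{
    /* same as vbslq_s8: take bits of b where mask is 1, bits of c where mask is 0 */
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, c));
}

int main(void)
{
    uint32_t m[4] = { 0xFFFF0000u, 0x0F0F0F0Fu, 0x00000000u, 0xFFFFFFFFu };
    uint32_t b[4] = { 0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u };
    uint32_t c[4] = { 0xAAAAAAAAu, 0xBBBBBBBBu, 0xCCCCCCCCu, 0xDDDDDDDDu };
    uint32_t r[4];
    int i;

    __m128i vr = select_bits(_mm_loadu_si128((const __m128i *)m),
                             _mm_loadu_si128((const __m128i *)b),
                             _mm_loadu_si128((const __m128i *)c));
    _mm_storeu_si128((__m128i *)r, vr);

    for (i = 0; i < 4; i++) {
        uint32_t expect = (m[i] & b[i]) | (~m[i] & c[i]); /* scalar reference */
        printf("lane %d: got %08x expect %08x\n", i, (unsigned)r[i], (unsigned)expect);
    }
    return 0;
}
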
++//***************** Interleave elements *************************** ++//***************************************************************** ++//output has (a0,b0,a1,b1, a2,b2,.....) ++int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); ++ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 ++#define vzip_s32 vtrn_s32 ++ ++uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 ++#define vzip_u8 vzip_s8 ++ ++uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 ++#define vzip_u16 vzip_s16 ++ ++uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 ++#define vzip_u32 vzip_s32 ++ ++float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 ++#define vzip_f32 vtrn_f32 ++ ++poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 ++#define vzip_p8 vzip_u8 ++ ++poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 ++#define vzip_p16 vzip_u16 ++ ++int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 ++{ ++ int8x16x2_t r8x16; ++ r8x16.val[0] = _mm_unpacklo_epi8(a, b); ++ r8x16.val[1] = _mm_unpackhi_epi8(a, b); ++ return r8x16; ++} ++ ++int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 ++{ ++ int16x8x2_t r16x8; ++ r16x8.val[0] = _mm_unpacklo_epi16(a, b); ++ r16x8.val[1] = _mm_unpackhi_epi16(a, b); ++ return r16x8; ++} ++ ++int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 ++{ ++ int32x4x2_t r32x4; ++ r32x4.val[0] = _mm_unpacklo_epi32(a, b); ++ r32x4.val[1] = _mm_unpackhi_epi32(a, b); ++ return r32x4; ++} ++ ++uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 ++#define vzipq_u8 vzipq_s8 ++ ++uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 ++#define vzipq_u16 vzipq_s16 ++ ++uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 ++#define vzipq_u32 vzipq_s32 ++ ++float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 ++{ ++ float32x4x2_t f32x4; ++ f32x4.val[0] = _mm_unpacklo_ps ( a, b); ++ f32x4.val[1] = _mm_unpackhi_ps ( a, b); ++ return f32x4; ++} ++ ++poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 ++#define vzipq_p8 vzipq_u8 ++ ++poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 ++#define vzipq_p16 vzipq_u16 ++ ++//*********************** De-Interleave elements ************************* ++//************************************************************************* ++//As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...) 
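
As a concrete sketch of these lane orders (again, illustration only, not part of the header): zipping two byte vectors with _mm_unpacklo_epi8/_mm_unpackhi_epi8 yields (a0,b0,a1,b1,...), while un-zipping uses the same even/odd pshufb mask as vuzpq_s8 below, followed by 64-bit unpacks. The helper name dump8 and the test vectors are invented for the example; building it requires SSSE3 (e.g. -mssse3).

/* Minimal sketch of the interleave/de-interleave lane orders (not part of the header).
   dump8 and the test vectors are illustrative only; needs SSSE3 for _mm_shuffle_epi8. */
#include <tmmintrin.h>  /* SSSE3 */
#include <stdint.h>
#include <stdio.h>

static void dump8(const char *name, __m128i v)
{
    uint8_t x[16];
    int i;
    _mm_storeu_si128((__m128i *)x, v);
    printf("%-7s", name);
    for (i = 0; i < 16; i++) printf(" %02x", x[i]);
    printf("\n");
}

int main(void)
{
    uint8_t ab[16], bb[16];
    int i;
    for (i = 0; i < 16; i++) { ab[i] = (uint8_t)i; bb[i] = (uint8_t)(0x10 + i); } /* a0..a15, b0..b15 */
    __m128i a = _mm_loadu_si128((const __m128i *)ab);
    __m128i b = _mm_loadu_si128((const __m128i *)bb);

    /* zip (vzipq): a0,b0,a1,b1,... in the low result, a8,b8,... in the high result */
    dump8("zip.lo", _mm_unpacklo_epi8(a, b));
    dump8("zip.hi", _mm_unpackhi_epi8(a, b));

    /* unzip (vuzpq): shuffle even lanes into the low 64 bits and odd lanes into the
       high 64 bits of each input, then recombine with 64-bit unpacks -- the same
       mask8_even_odd approach as vuzpq_s8 below */
    {
        const __m128i even_odd =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        __m128i a_sh = _mm_shuffle_epi8(a, even_odd);
        __m128i b_sh = _mm_shuffle_epi8(b, even_odd);
        dump8("uzp[0]", _mm_unpacklo_epi64(a_sh, b_sh)); /* a0,a2,...,a14,b0,b2,...,b14 */
        dump8("uzp[1]", _mm_unpackhi_epi64(a_sh, b_sh)); /* a1,a3,...,a15,b1,b3,...,b15 */
    }
    return 0;
}
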
++//no such functions in IA32 SIMD, shuffle is required ++int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15}; ++ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7) ++ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; ++ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3 ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 ++_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0 ++{ ++ int32x2x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 ++ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 ++#define vuzp_u8 vuzp_s8 ++ ++uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 ++#define vuzp_u16 vuzp_s16 ++ ++uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 ++#define vuzp_u32 vuzp_s32 ++ ++float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 ++#define vuzp_f32 vzip_f32 ++ ++poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 ++#define vuzp_p8 vuzp_u8 ++ ++poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 ++#define vuzp_p16 vuzp_u16 ++ ++int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 ++{ ++ int8x16x2_t v8x16; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 ++ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b ++ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, ++ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 ++ return v8x16; ++} ++ ++int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 ++{ ++ int16x8x2_t v16x8; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, 
b4, b6, b1, b3, b5, b7 ++ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 ++ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 ++ return v16x8; ++} ++ ++int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ int32x4x2_t v32x4; ++ __m128i a_sh, b_sh; ++ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 ++ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 ++ ++ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 ++ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 ++ return v32x4; ++} ++ ++uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 ++#define vuzpq_u8 vuzpq_s8 ++ ++uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 ++#define vuzpq_u16 vuzpq_s16 ++ ++uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 ++#define vuzpq_u32 vuzpq_s32 ++ ++float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 ++{ ++ float32x4x2_t v32x4; ++ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however ++ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however ++ return v32x4; ++} ++ ++poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 ++#define vuzpq_p8 vuzpq_u8 ++ ++poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 ++#define vuzpq_p16 vuzpq_u16 ++ ++//############################################################################################## ++//*********************** Reinterpret cast intrinsics.****************************************** ++//############################################################################################## ++// Not a part of oficial NEON instruction set but available in gcc compiler ********************* ++poly8x8_t vreinterpret_p8_u32 (uint32x2_t t); ++#define vreinterpret_p8_u32 ++ ++poly8x8_t vreinterpret_p8_u16 (uint16x4_t t); ++#define vreinterpret_p8_u16 ++ ++poly8x8_t vreinterpret_p8_u8 (uint8x8_t t); ++#define vreinterpret_p8_u8 ++ ++poly8x8_t vreinterpret_p8_s32 (int32x2_t t); ++#define vreinterpret_p8_s32 ++ ++poly8x8_t vreinterpret_p8_s16 (int16x4_t t); ++#define vreinterpret_p8_s16 ++ ++poly8x8_t vreinterpret_p8_s8 (int8x8_t t); ++#define vreinterpret_p8_s8 ++ ++poly8x8_t vreinterpret_p8_u64 (uint64x1_t t); ++#define vreinterpret_p8_u64 ++ ++poly8x8_t vreinterpret_p8_s64 (int64x1_t t); ++#define vreinterpret_p8_s64 ++ ++poly8x8_t vreinterpret_p8_f32 (float32x2_t t); ++#define vreinterpret_p8_f32 ++ ++poly8x8_t vreinterpret_p8_p16 (poly16x4_t t); ++#define vreinterpret_p8_p16 ++ ++poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); ++#define vreinterpretq_p8_u32 ++ ++poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); ++#define vreinterpretq_p8_u16 ++ ++poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); ++#define vreinterpretq_p8_u8 ++ ++poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); ++#define vreinterpretq_p8_s32 ++ ++poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); ++#define vreinterpretq_p8_s16 ++ ++poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); ++#define vreinterpretq_p8_s8 ++ ++poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); ++#define vreinterpretq_p8_u64 ++ ++poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); ++#define vreinterpretq_p8_s64 
++ ++poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); ++#define vreinterpretq_p8_f32(t) _M128i(t) ++ ++poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); ++#define vreinterpretq_p8_p16 ++ ++poly16x4_t vreinterpret_p16_u32 (uint32x2_t t); ++#define vreinterpret_p16_u32 ++ ++poly16x4_t vreinterpret_p16_u16 (uint16x4_t t); ++#define vreinterpret_p16_u16 ++ ++poly16x4_t vreinterpret_p16_u8 (uint8x8_t t); ++#define vreinterpret_p16_u8 ++ ++poly16x4_t vreinterpret_p16_s32 (int32x2_t t); ++#define vreinterpret_p16_s32 ++ ++poly16x4_t vreinterpret_p16_s16 (int16x4_t t); ++#define vreinterpret_p16_s16 ++ ++poly16x4_t vreinterpret_p16_s8 (int8x8_t t); ++#define vreinterpret_p16_s8 ++ ++poly16x4_t vreinterpret_p16_u64 (uint64x1_t t); ++#define vreinterpret_p16_u64 ++ ++poly16x4_t vreinterpret_p16_s64 (int64x1_t t); ++#define vreinterpret_p16_s64 ++ ++poly16x4_t vreinterpret_p16_f32 (float32x2_t t); ++#define vreinterpret_p16_f32 ++ ++poly16x4_t vreinterpret_p16_p8 (poly8x8_t t); ++#define vreinterpret_p16_p8 ++ ++poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); ++#define vreinterpretq_p16_u32 ++ ++poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); ++#define vreinterpretq_p16_u16 ++ ++poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); ++#define vreinterpretq_p16_s32 ++ ++poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); ++#define vreinterpretq_p16_s16 ++ ++poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); ++#define vreinterpretq_p16_s8 ++ ++poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); ++#define vreinterpretq_p16_u64 ++ ++poly16x8_t vreinterpretq_p16_s64 (int64x2_t t); ++#define vreinterpretq_p16_s64 ++ ++poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); ++#define vreinterpretq_p16_f32(t) _M128i(t) ++ ++poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); ++#define vreinterpretq_p16_p8 vreinterpretq_s16_p8 ++ ++//**** Integer to float ****** ++float32x2_t vreinterpret_f32_u32 (uint32x2_t t); ++#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t)) ++ ++ ++float32x2_t vreinterpret_f32_u16 (uint16x4_t t); ++#define vreinterpret_f32_u16 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_u8 (uint8x8_t t); ++#define vreinterpret_f32_u8 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s32 (int32x2_t t); ++#define vreinterpret_f32_s32 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s16 (int16x4_t t); ++#define vreinterpret_f32_s16 vreinterpret_f32_u32 ++ ++float32x2_t vreinterpret_f32_s8 (int8x8_t t); ++#define vreinterpret_f32_s8 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_u64(uint64x1_t t); ++#define vreinterpret_f32_u64 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s64 (int64x1_t t); ++#define vreinterpret_f32_s64 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_p16 (poly16x4_t t); ++#define vreinterpret_f32_p16 vreinterpret_f32_u32 ++ ++float32x2_t vreinterpret_f32_p8 (poly8x8_t t); ++#define vreinterpret_f32_p8 vreinterpret_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); ++#define vreinterpretq_f32_u32(t) *(__m128*)&(t) ++ ++float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); ++#define vreinterpretq_f32_u16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); ++#define vreinterpretq_f32_u8 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s32 (int32x4_t t); ++#define vreinterpretq_f32_s32 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s16 (int16x8_t t); ++#define vreinterpretq_f32_s16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s8 (int8x16_t t); ++#define vreinterpretq_f32_s8 
vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); ++#define vreinterpretq_f32_u64 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s64 (int64x2_t t); ++#define vreinterpretq_f32_s64 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); ++#define vreinterpretq_f32_p16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); ++#define vreinterpretq_f32_p8 vreinterpretq_f32_u32 ++ ++//*** Integer type conversions ****************** ++//no conversion necessary for the following functions because it is same data type ++int64x1_t vreinterpret_s64_u32 (uint32x2_t t); ++#define vreinterpret_s64_u32 ++ ++int64x1_t vreinterpret_s64_u16 (uint16x4_t t); ++#define vreinterpret_s64_u16 ++ ++int64x1_t vreinterpret_s64_u8 (uint8x8_t t); ++#define vreinterpret_s64_u8 ++ ++int64x1_t vreinterpret_s64_s32 (int32x2_t t); ++#define vreinterpret_s64_s32 ++ ++int64x1_t vreinterpret_s64_s16 (int16x4_t t); ++#define vreinterpret_s64_s16 ++ ++int64x1_t vreinterpret_s64_s8 (int8x8_t t); ++#define vreinterpret_s64_s8 ++ ++int64x1_t vreinterpret_s64_u64 (uint64x1_t t); ++#define vreinterpret_s64_u64 ++ ++int64x1_t vreinterpret_s64_f32 (float32x2_t t); ++#define vreinterpret_s64_f32 ++ ++int64x1_t vreinterpret_s64_p16 (poly16x4_t t); ++#define vreinterpret_s64_p16 ++ ++int64x1_t vreinterpret_s64_p8 (poly8x8_t t); ++#define vreinterpret_s64_p8 ++ ++int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); ++#define vreinterpretq_s64_u32 ++ ++int64x2_t vreinterpretq_s64_s16 (uint16x8_t t); ++#define vreinterpretq_s64_s16 ++ ++int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); ++#define vreinterpretq_s64_u8 ++ ++int64x2_t vreinterpretq_s64_s32 (int32x4_t t); ++#define vreinterpretq_s64_s32 ++ ++int64x2_t vreinterpretq_s64_u16 (int16x8_t t); ++#define vreinterpretq_s64_u16 ++ ++int64x2_t vreinterpretq_s64_s8 (int8x16_t t); ++#define vreinterpretq_s64_s8 ++ ++int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); ++#define vreinterpretq_s64_u64 ++ ++int64x2_t vreinterpretq_s64_f32 (float32x4_t t); ++#define vreinterpretq_s64_f32(t) _M128i(t) ++ ++int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); ++#define vreinterpretq_s64_p16 ++ ++int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); ++#define vreinterpretq_s64_p8 ++ ++uint64x1_t vreinterpret_u64_u32 (uint32x2_t t); ++#define vreinterpret_u64_u32 ++ ++uint64x1_t vreinterpret_u64_u16 (uint16x4_t t); ++#define vreinterpret_u64_u16 ++ ++uint64x1_t vreinterpret_u64_u8 (uint8x8_t t); ++#define vreinterpret_u64_u8 ++ ++uint64x1_t vreinterpret_u64_s32 (int32x2_t t); ++#define vreinterpret_u64_s32 ++ ++uint64x1_t vreinterpret_u64_s16 (int16x4_t t); ++#define vreinterpret_u64_s16 ++ ++uint64x1_t vreinterpret_u64_s8 (int8x8_t t); ++#define vreinterpret_u64_s8 ++ ++uint64x1_t vreinterpret_u64_s64 (int64x1_t t); ++#define vreinterpret_u64_s64 ++ ++uint64x1_t vreinterpret_u64_f32 (float32x2_t t); ++#define vreinterpret_u64_f32 ++ ++uint64x1_t vreinterpret_u64_p16 (poly16x4_t t); ++#define vreinterpret_u64_p16 ++ ++uint64x1_t vreinterpret_u64_p8 (poly8x8_t t); ++#define vreinterpret_u64_p8 ++ ++uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); ++#define vreinterpretq_u64_u32 ++ ++uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); ++#define vreinterpretq_u64_u16 ++ ++uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); ++#define vreinterpretq_u64_u8 ++ ++uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); ++#define vreinterpretq_u64_s32 ++ ++uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); ++#define vreinterpretq_u64_s16 ++ ++uint64x2_t 
vreinterpretq_u64_s8 (int8x16_t t); ++#define vreinterpretq_u64_s8 ++ ++uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); ++#define vreinterpretq_u64_s64 ++ ++uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); ++#define vreinterpretq_u64_f32(t) _M128i(t) ++ ++uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); ++#define vreinterpretq_u64_p16 ++ ++uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); ++#define vreinterpretq_u64_p8 ++ ++int8x8_t vreinterpret_s8_u32 (uint32x2_t t); ++#define vreinterpret_s8_u32 ++ ++int8x8_t vreinterpret_s8_u16 (uint16x4_t t); ++#define vreinterpret_s8_u16 ++ ++int8x8_t vreinterpret_s8_u8 (uint8x8_t t); ++#define vreinterpret_s8_u8 ++ ++int8x8_t vreinterpret_s8_s32 (int32x2_t t); ++#define vreinterpret_s8_s32 ++ ++int8x8_t vreinterpret_s8_s16 (int16x4_t t); ++#define vreinterpret_s8_s16 ++ ++int8x8_t vreinterpret_s8_u64 (uint64x1_t t); ++#define vreinterpret_s8_u64 ++ ++int8x8_t vreinterpret_s8_s64 (int64x1_t t); ++#define vreinterpret_s8_s64 ++ ++int8x8_t vreinterpret_s8_f32 (float32x2_t t); ++#define vreinterpret_s8_f32 ++ ++int8x8_t vreinterpret_s8_p16 (poly16x4_t t); ++#define vreinterpret_s8_p16 ++ ++int8x8_t vreinterpret_s8_p8 (poly8x8_t t); ++#define vreinterpret_s8_p8 ++ ++int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); ++#define vreinterpretq_s8_u32 ++ ++int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); ++#define vreinterpretq_s8_u16 ++ ++int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); ++#define vreinterpretq_s8_u8 ++ ++int8x16_t vreinterpretq_s8_s32 (int32x4_t t); ++#define vreinterpretq_s8_s32 ++ ++int8x16_t vreinterpretq_s8_s16 (int16x8_t t); ++#define vreinterpretq_s8_s16 ++ ++int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); ++#define vreinterpretq_s8_u64 ++ ++int8x16_t vreinterpretq_s8_s64 (int64x2_t t); ++#define vreinterpretq_s8_s64 ++ ++int8x16_t vreinterpretq_s8_f32 (float32x4_t t); ++#define vreinterpretq_s8_f32(t) _M128i(t) ++ ++int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); ++#define vreinterpretq_s8_p16 ++ ++int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); ++#define vreinterpretq_s8_p8 ++ ++int16x4_t vreinterpret_s16_u32 (uint32x2_t t); ++#define vreinterpret_s16_u32 ++ ++int16x4_t vreinterpret_s16_u16 (uint16x4_t t); ++#define vreinterpret_s16_u16 ++ ++int16x4_t vreinterpret_s16_u8 (uint8x8_t t); ++#define vreinterpret_s16_u8 ++ ++int16x4_t vreinterpret_s16_s32 (int32x2_t t); ++#define vreinterpret_s16_s32 ++ ++int16x4_t vreinterpret_s16_s8 (int8x8_t t); ++#define vreinterpret_s16_s8 ++ ++int16x4_t vreinterpret_s16_u64 (uint64x1_t t); ++#define vreinterpret_s16_u64 ++ ++int16x4_t vreinterpret_s16_s64 (int64x1_t t); ++#define vreinterpret_s16_s64 ++ ++int16x4_t vreinterpret_s16_f32 (float32x2_t t); ++#define vreinterpret_s16_f32 ++ ++ ++int16x4_t vreinterpret_s16_p16 (poly16x4_t t); ++#define vreinterpret_s16_p16 ++ ++int16x4_t vreinterpret_s16_p8 (poly8x8_t t); ++#define vreinterpret_s16_p8 ++ ++int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); ++#define vreinterpretq_s16_u32 ++ ++int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); ++#define vreinterpretq_s16_u16 ++ ++int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); ++#define vreinterpretq_s16_u8 ++ ++int16x8_t vreinterpretq_s16_s32 (int32x4_t t); ++#define vreinterpretq_s16_s32 ++ ++int16x8_t vreinterpretq_s16_s8 (int8x16_t t); ++#define vreinterpretq_s16_s8 ++ ++int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); ++#define vreinterpretq_s16_u64 ++ ++int16x8_t vreinterpretq_s16_s64 (int64x2_t t); ++#define vreinterpretq_s16_s64 ++ ++int16x8_t vreinterpretq_s16_f32 (float32x4_t t); ++#define vreinterpretq_s16_f32(t) 
_M128i(t) ++ ++int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); ++#define vreinterpretq_s16_p16 ++ ++int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); ++#define vreinterpretq_s16_p8 ++ ++int32x2_t vreinterpret_s32_u32 (uint32x2_t t); ++#define vreinterpret_s32_u32 ++ ++int32x2_t vreinterpret_s32_u16 (uint16x4_t t); ++#define vreinterpret_s32_u16 ++ ++int32x2_t vreinterpret_s32_u8 (uint8x8_t t); ++#define vreinterpret_s32_u8 ++ ++int32x2_t vreinterpret_s32_s16 (int16x4_t t); ++#define vreinterpret_s32_s16 ++ ++int32x2_t vreinterpret_s32_s8 (int8x8_t t); ++#define vreinterpret_s32_s8 ++ ++int32x2_t vreinterpret_s32_u64 (uint64x1_t t); ++#define vreinterpret_s32_u64 ++ ++int32x2_t vreinterpret_s32_s64 (int64x1_t t); ++#define vreinterpret_s32_s64 ++ ++int32x2_t vreinterpret_s32_f32 (float32x2_t t); ++#define vreinterpret_s32_f32 ++ ++int32x2_t vreinterpret_s32_p16 (poly16x4_t t); ++#define vreinterpret_s32_p16 ++ ++int32x2_t vreinterpret_s32_p8 (poly8x8_t t); ++#define vreinterpret_s32_p8 ++ ++int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); ++#define vreinterpretq_s32_u32 ++ ++int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); ++#define vreinterpretq_s32_u16 ++ ++int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); ++#define vreinterpretq_s32_u8 ++ ++int32x4_t vreinterpretq_s32_s16 (int16x8_t t); ++#define vreinterpretq_s32_s16 ++ ++int32x4_t vreinterpretq_s32_s8 (int8x16_t t); ++#define vreinterpretq_s32_s8 ++ ++int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); ++#define vreinterpretq_s32_u64 ++ ++int32x4_t vreinterpretq_s32_s64 (int64x2_t t); ++#define vreinterpretq_s32_s64 ++ ++int32x4_t vreinterpretq_s32_f32 (float32x4_t t); ++#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t)) ++ ++int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); ++#define vreinterpretq_s32_p16 ++ ++int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); ++#define vreinterpretq_s32_p8 ++ ++uint8x8_t vreinterpret_u8_u32 (uint32x2_t t); ++#define vreinterpret_u8_u32 ++ ++uint8x8_t vreinterpret_u8_u16 (uint16x4_t t); ++#define vreinterpret_u8_u16 ++ ++uint8x8_t vreinterpret_u8_s32 (int32x2_t t); ++#define vreinterpret_u8_s32 ++ ++uint8x8_t vreinterpret_u8_s16 (int16x4_t t); ++#define vreinterpret_u8_s16 ++ ++uint8x8_t vreinterpret_u8_s8 (int8x8_t t); ++#define vreinterpret_u8_s8 ++ ++uint8x8_t vreinterpret_u8_u64 (uint64x1_t t); ++#define vreinterpret_u8_u64 ++ ++uint8x8_t vreinterpret_u8_s64 (int64x1_t t); ++#define vreinterpret_u8_s64 ++ ++uint8x8_t vreinterpret_u8_f32 (float32x2_t t); ++#define vreinterpret_u8_f32 ++ ++uint8x8_t vreinterpret_u8_p16 (poly16x4_t t); ++#define vreinterpret_u8_p16 ++ ++uint8x8_t vreinterpret_u8_p8 (poly8x8_t t); ++#define vreinterpret_u8_p8 ++ ++uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); ++#define vreinterpretq_u8_u32 ++ ++uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); ++#define vreinterpretq_u8_u16 ++ ++uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); ++#define vreinterpretq_u8_s32 ++ ++uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); ++#define vreinterpretq_u8_s16 ++ ++uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); ++#define vreinterpretq_u8_s8 ++ ++uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); ++#define vreinterpretq_u8_u64 ++ ++uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); ++#define vreinterpretq_u8_s64 ++ ++uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); ++#define vreinterpretq_u8_f32(t) _M128i(t) ++ ++ ++uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); ++#define vreinterpretq_u8_p16 ++ ++uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); ++#define vreinterpretq_u8_p8 ++ ++uint16x4_t 
vreinterpret_u16_u32 (uint32x2_t t); ++#define vreinterpret_u16_u32 ++ ++uint16x4_t vreinterpret_u16_u8 (uint8x8_t t); ++#define vreinterpret_u16_u8 ++ ++uint16x4_t vreinterpret_u16_s32 (int32x2_t t); ++#define vreinterpret_u16_s32 ++ ++uint16x4_t vreinterpret_u16_s16 (int16x4_t t); ++#define vreinterpret_u16_s16 ++ ++uint16x4_t vreinterpret_u16_s8 (int8x8_t t); ++#define vreinterpret_u16_s8 ++ ++uint16x4_t vreinterpret_u16_u64 (uint64x1_t t); ++#define vreinterpret_u16_u64 ++ ++uint16x4_t vreinterpret_u16_s64 (int64x1_t t); ++#define vreinterpret_u16_s64 ++ ++uint16x4_t vreinterpret_u16_f32 (float32x2_t t); ++#define vreinterpret_u16_f32 ++ ++uint16x4_t vreinterpret_u16_p16 (poly16x4_t t); ++#define vreinterpret_u16_p16 ++ ++uint16x4_t vreinterpret_u16_p8 (poly8x8_t t); ++#define vreinterpret_u16_p8 ++ ++uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); ++#define vreinterpretq_u16_u32 ++ ++uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); ++#define vreinterpretq_u16_u8 ++ ++uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); ++#define vreinterpretq_u16_s32 ++ ++uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); ++#define vreinterpretq_u16_s16 ++ ++uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); ++#define vreinterpretq_u16_s8 ++ ++uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); ++#define vreinterpretq_u16_u64 ++ ++uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); ++#define vreinterpretq_u16_s64 ++ ++uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); ++#define vreinterpretq_u16_f32(t) _M128i(t) ++ ++uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); ++#define vreinterpretq_u16_p16 ++ ++uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); ++#define vreinterpretq_u16_p8 ++ ++uint32x2_t vreinterpret_u32_u16 (uint16x4_t t); ++#define vreinterpret_u32_u16 ++ ++uint32x2_t vreinterpret_u32_u8 (uint8x8_t t); ++#define vreinterpret_u32_u8 ++ ++uint32x2_t vreinterpret_u32_s32 (int32x2_t t); ++#define vreinterpret_u32_s32 ++ ++uint32x2_t vreinterpret_u32_s16 (int16x4_t t); ++#define vreinterpret_u32_s16 ++ ++uint32x2_t vreinterpret_u32_s8 (int8x8_t t); ++#define vreinterpret_u32_s8 ++ ++uint32x2_t vreinterpret_u32_u64 (uint64x1_t t); ++#define vreinterpret_u32_u64 ++ ++uint32x2_t vreinterpret_u32_s64 (int64x1_t t); ++#define vreinterpret_u32_s64 ++ ++uint32x2_t vreinterpret_u32_f32 (float32x2_t t); ++#define vreinterpret_u32_f32 ++ ++uint32x2_t vreinterpret_u32_p16 (poly16x4_t t); ++#define vreinterpret_u32_p16 ++ ++uint32x2_t vreinterpret_u32_p8 (poly8x8_t t); ++#define vreinterpret_u32_p8 ++ ++uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); ++#define vreinterpretq_u32_u16 ++ ++uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); ++#define vreinterpretq_u32_u8 ++ ++uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); ++#define vreinterpretq_u32_s32 ++ ++uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); ++#define vreinterpretq_u32_s16 ++ ++uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); ++#define vreinterpretq_u32_s8 ++ ++uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); ++#define vreinterpretq_u32_u64 ++ ++uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); ++#define vreinterpretq_u32_s64 ++ ++uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); ++#define vreinterpretq_u32_f32(t) _M128i(t) ++ ++uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); ++#define vreinterpretq_u32_p16 ++ ++uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); ++#define vreinterpretq_u32_p8 ++ ++#endif /* NEON2SSE_H */ +diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h +index fee33a3..22fb2ce 100644 +--- a/gcc/config/i386/gnu-user.h ++++ 
b/gcc/config/i386/gnu-user.h +@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see + When the -shared link option is used a final link is not being + done. */ + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ " -mssse3 -fno-short-enums " \ ++ + #undef ASM_SPEC + #define ASM_SPEC \ +- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" ++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + #undef SUBTARGET_EXTRA_SPECS + #define SUBTARGET_EXTRA_SPECS \ +diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h +index 7a02a7e..cac4179 100644 +--- a/gcc/config/i386/gnu-user64.h ++++ b/gcc/config/i386/gnu-user64.h +@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define SPEC_X32 "mx32" + #endif + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ "%{m32:-mssse3 -fno-short-enums}" \ ++ "%{!m32:-msse4.2 -mpopcnt}" ++ + #undef ASM_SPEC + #define ASM_SPEC "%{" SPEC_32 ":--32} \ + %{" SPEC_64 ":--64} \ +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 3d044e8..5c89fca 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC ++ && !TARGET_HAS_BIONIC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) +diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h +index 4b9910f..c33e074 100644 +--- a/gcc/config/i386/linux-common.h ++++ b/gcc/config/i386/linux-common.h +@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. 
If not see + #undef CC1_SPEC + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC \ ++ ANDROID_TARGET_CC1_SPEC \ ++ " " \ ++ ANDROID_CC1_SPEC("-fPIC")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + + #undef LINK_SPEC + #define LINK_SPEC \ +diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h +index a1f98d3..3725799 100644 +--- a/gcc/config/i386/pmm_malloc.h ++++ b/gcc/config/i386/pmm_malloc.h +@@ -31,7 +31,7 @@ + #ifndef __cplusplus + extern int posix_memalign (void **, size_t, size_t); + #else +-extern "C" int posix_memalign (void **, size_t, size_t) throw (); ++extern "C" int posix_memalign (void **, size_t, size_t); + #endif + + static __inline void * +diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h +index 301a41c..bebfe7f 100644 +--- a/gcc/config/linux-android.h ++++ b/gcc/config/linux-android.h +@@ -38,15 +38,18 @@ + "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" + + #define ANDROID_LINK_SPEC \ +- "%{shared: -Bsymbolic}" ++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" + +-#define ANDROID_CC1_SPEC \ ++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ + "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ +- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" ++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" + + #define ANDROID_CC1PLUS_SPEC \ +- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ +- "%{!frtti:%{!fno-rtti: -fno-rtti}}" ++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ ++ "%{!frtti:%{!fno-rtti: -frtti}}" ++ ++#define ANDROID_ASM_SPEC \ ++ "--noexecstack" + + #define ANDROID_LIB_SPEC \ + "%{!static: -ldl}" +diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h +new file mode 100644 +index 0000000..32c539c +--- /dev/null ++++ b/gcc/config/mips/android.h +@@ -0,0 +1,49 @@ ++/* Target macros for mips*-*android* targets. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#undef DRIVER_SELF_SPECS ++#define DRIVER_SELF_SPECS \ ++ /* Make sure a -mips option is present. This helps us to pick \ ++ the right multilib, and also makes the later specs easier \ ++ to write. */ \ ++ MIPS_ISA_LEVEL_SPEC, \ ++ \ ++ /* Infer the default float setting from -march. */ \ ++ MIPS_ARCH_FLOAT_SPEC, \ ++ \ ++ /* Infer the -msynci setting from -march if not explicitly set. */ \ ++ MIPS_ISA_SYNCI_SPEC, \ ++ \ ++ /* If no ABI option is specified, infer one from the ISA level \ ++ or -mgp setting. */ \ ++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ ++ \ ++ /* If no FP ABI option is specified, infer one from the \ ++ ABI/ISA level unless there is a conflicting option. 
*/ \ ++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ ++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ ++ \ ++ /* If no odd-spreg option is specified, infer one from the ISA. */ \ ++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ ++ \ ++ /* Base SPECs. */ \ ++ BASE_DRIVER_SELF_SPECS, \ ++ \ ++ /* Use the standard linux specs for everything else. */ \ ++ LINUX_DRIVER_SELF_SPECS +diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h +index 15b549c..4a28160 100644 +--- a/gcc/config/mips/gnu-user.h ++++ b/gcc/config/mips/gnu-user.h +@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } while (0) + + #undef SUBTARGET_CPP_SPEC +@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see + + #undef SUBTARGET_ASM_SPEC + #define SUBTARGET_ASM_SPEC \ +- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" ++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + /* The MIPS assembler has different syntax for .set. We set it to + .dummy to trap any errors. */ +@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + #endif + + #define LINUX_DRIVER_SELF_SPECS \ +- NO_SHARED_SPECS \ ++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ + MARCH_MTUNE_NATIVE_SPECS, \ + /* -mplt has no effect without -mno-shared. Simplify later \ + specs handling by removing a redundant option. */ \ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8429a7c..4b0825d 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see + #undef SUBTARGET_CC1_SPEC + #define SUBTARGET_CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) + + #undef CC1PLUS_SPEC + #define CC1PLUS_SPEC \ +@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see + + /* The default value isn't sufficient in 64-bit mode. */ + #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif +diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android +new file mode 100644 +index 0000000..39f512c +--- /dev/null ++++ b/gcc/config/mips/t-linux-android +@@ -0,0 +1,3 @@ ++MULTILIB_OPTIONS = mips32r2/mips32r6 ++MULTILIB_DIRNAMES = mips-r2 mips-r6 ++MULTILIB_OSDIRNAMES = ../libr2 ../libr6 +diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 +new file mode 100644 +index 0000000..55cab7d +--- /dev/null ++++ b/gcc/config/mips/t-linux-android64 +@@ -0,0 +1,4 @@ ++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 ++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 ++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 ++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 +diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h +index 37ecfc4..a5f1b99 100644 +--- a/gcc/config/openbsd.h ++++ b/gcc/config/openbsd.h +@@ -136,8 +136,12 @@ while (0) + #define LIB_SPEC OBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LIB_SPEC + #define LIB_SPEC OBSD_LIB_SPEC +diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h +index cbf9097..eb2217f 100644 +--- a/gcc/config/rs6000/sysv4.h ++++ b/gcc/config/rs6000/sysv4.h +@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) + -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + #if defined(HAVE_LD_EH_FRAME_HDR) +-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# ifdef USE_EH_FRAME_HDR_FOR_STATIC ++# define LINK_EH_SPEC "--eh-frame-hdr " ++# else ++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# endif + #endif + + #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ +diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h +index 5160e1f..7632a50 100644 +--- a/gcc/config/sol2.h ++++ b/gcc/config/sol2.h +@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see + /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs + --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++#endif + #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ + #endif + +diff --git a/gcc/configure b/gcc/configure +index 1c6e340..28ad050 100755 +--- a/gcc/configure ++++ b/gcc/configure +@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 + enable_fix_cortex_a53_843419 + with_glibc_version + enable_gnu_unique_object ++enable_eh_frame_hdr_for_static + enable_linker_build_id + enable_default_ssp + with_long_double_128 +@@ -1670,6 +1671,9 @@ Optional Features: + --enable-gnu-unique-object + enable the use of the @gnu_unique_object ELF + extension on glibc systems ++ --enable-eh-frame-hdr-for-static ++ enable linker PT_GNU_EH_FRAME support for static ++ executable + --enable-linker-build-id + compiler will always pass --build-id to linker + --enable-default-ssp enable Stack Smashing Protection as default +@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + + $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h + ++ # Check whether --enable-eh-frame-hdr-for-static was given. ++if test "${enable_eh_frame_hdr_for_static+set}" = set; then : ++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; ++ esac ++else ++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. ++ if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi ++fi ++ ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ ++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h ++ ++ fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 + $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } +diff --git a/gcc/configure.ac b/gcc/configure.ac +index 6c1dcd9..0cf7419 100644 +--- a/gcc/configure.ac ++++ b/gcc/configure.ac +@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) + if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, + [Define if your linker supports .eh_frame_hdr.]) ++ AC_ARG_ENABLE(eh-frame-hdr-for-static, ++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], ++ [enable linker PT_GNU_EH_FRAME support for static executable])], ++ [case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'.]) ;; ++ esac], ++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. 
++ [[if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi]]) ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, ++[Define if your system supports PT_GNU_EH_FRAME for static executable.]) ++ fi + fi + AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) + +diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C +new file mode 100644 +index 0000000..15408ef +--- /dev/null ++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C +@@ -0,0 +1,25 @@ ++// PR c++/4381 ++// Test that exception-specs work properly for classes with virtual bases. ++ ++// { dg-do run } ++// { dg-options "-static" } ++ ++class Base {}; ++ ++struct A : virtual public Base ++{ ++ A() {} ++}; ++ ++struct B {}; ++ ++void func() throw (B,A) ++{ ++ throw A(); ++} ++ ++int main(void) ++{ ++ try { func(); } ++ catch (A& a) { } ++} +diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c +index f3343fc..d426477 100644 +--- a/libgcc/crtstuff.c ++++ b/libgcc/crtstuff.c +@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) + #include + # define USE_PT_GNU_EH_FRAME +@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__sun__) && defined(__svr4__) + #include + # define USE_PT_GNU_EH_FRAME +@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__GLIBC__) && __GLIBC__ >= 2 + #include + /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. +@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(CRTSTUFFT_O) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(inhibit_libc) \ + && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) + /* On systems using glibc, an inhibit_libc build of libgcc is only +diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h +index 555c0fe..47c8655 100644 +--- a/libgcc/gthr-posix.h ++++ b/libgcc/gthr-posix.h +@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see + #define __GTHREADS 1 + #define __GTHREADS_CXX0X 1 + ++/* The following should normally be in a different header file, ++ * but I couldn't find the right location. The point of the macro ++ * definition below is to prevent libsupc++ and libstdc++ to reference ++ * weak symbols in their static C++ constructors. Such code crashes ++ * when a shared object linked statically to these libraries is ++ * loaded on Android 2.1 (Eclair) and older platform releases, due ++ * to a dynamic linker bug. ++ */ ++#ifdef __ANDROID__ ++#undef GTHREAD_USE_WEAK ++#define GTHREAD_USE_WEAK 0 ++#endif ++ + #include + + #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ +diff --git a/libgcc/gthr.h b/libgcc/gthr.h +index 47a7d06..67a680f 100644 +--- a/libgcc/gthr.h ++++ b/libgcc/gthr.h +@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define GTHREAD_USE_WEAK 1 + #endif + #endif ++#if __ANDROID__ ++#include "gthr-posix.h" ++#else + #include "gthr-default.h" ++#endif + + #ifndef HIDE_EXPORTS + #pragma GCC visibility pop +diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure +index 41797a9..5a631dd 100755 +--- a/libstdc++-v3/configure ++++ b/libstdc++-v3/configure +@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h +index e3e206b..e85dc2c 100644 +--- a/libstdc++-v3/include/bits/locale_facets.h ++++ b/libstdc++-v3/include/bits/locale_facets.h +@@ -47,6 +47,20 @@ + #include + #include + ++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ ++// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and ++// ctype::_M_widen_init() methods working wrong if optimization enabled. ++// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) ++// are completely messed up and don't correspond to passed values. In case if ++// we disable optimization for those methods, things become correct so we apply ++// this workaround here for a time. ++// TODO: figure out what exactly wrong here - is it bug in GCC optimization ++// algorithm or smth else? ++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) ++#else ++#define __CRYSTAX_X86_DONT_OPTIMIZE ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + * @return @a __hi. 
+ */ + virtual const char* +- do_widen(const char* __lo, const char* __hi, char_type* __to) const ++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE + { + __builtin_memcpy(__to, __lo, __hi - __lo); + return __hi; +@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + + private: + void _M_narrow_init() const; +- void _M_widen_init() const; ++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; + }; + + #ifdef _GLIBCXX_USE_WCHAR_T +diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc +index 9b61799..c149169 100644 +--- a/libstdc++-v3/libsupc++/guard.cc ++++ b/libstdc++-v3/libsupc++/guard.cc +@@ -33,7 +33,12 @@ + #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ + && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) + # include ++#if defined(__ANDROID__) ++# include ++# define SYS_futex __NR_futex ++#else + # include ++#endif + # include + # define _GLIBCXX_USE_FUTEX + # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/scripts/build/arch/arm.sh b/scripts/build/arch/arm.sh index eeffc7d..8733ac3 100644 --- a/scripts/build/arch/arm.sh +++ b/scripts/build/arch/arm.sh @@ -18,6 +18,7 @@ CT_DoArchTupleValues() { *glibc,y) CT_TARGET_SYS=gnueabi;; uClibc,y) CT_TARGET_SYS=uclibc${CT_LIBC_UCLIBC_USE_GNU_SUFFIX:+gnu}eabi;; musl,y) CT_TARGET_SYS=musleabi;; + bionic,y) CT_TARGET_SYS=android;; *,y) CT_TARGET_SYS=eabi;; esac diff --git a/scripts/build/libc/bionic.sh b/scripts/build/libc/bionic.sh new file mode 100644 index 0000000..fcdc57f --- /dev/null +++ b/scripts/build/libc/bionic.sh @@ -0,0 +1,43 @@ +# This file adds functions to extract the bionic C library from the Android NDK +# Copyright 2017 Howard Chu +# Licensed under the GPL v2. See COPYING in the root of this package + +do_libc_get() { + if [ "${CT_LIBC_BIONIC_CUSTOM}" = "y" ]; then + CT_GetCustom "bionic" "${CT_LIBC_BIONIC_CUSTOM_VERSION}" \ + "${CT_LIBC_BIONIC_CUSTOM_LOCATION}" + else # ! custom location + CT_GetFile "android-ndk-${CT_LIBC_VERSION}-linux-x86_64.zip" https://dl.google.com/android/repository + fi # ! custom location +} + +do_libc_extract() { + CT_Extract "android-ndk-${CT_LIBC_VERSION}-linux-x86_64" + CT_Pushd "${CT_SRC_DIR}/android-ndk-${CT_LIBC_VERSION}/" + CT_Patch nochdir bionic "${CT_LIBC_VERSION}" + CT_Popd +} + +# Install Unified headers +do_libc_start_files() { + CT_DoStep INFO "Installing C library headers" + CT_DoExecLog ALL cp -r "${CT_SRC_DIR}/android-ndk-${CT_LIBC_VERSION}/sysroot/usr" "${CT_SYSROOT_DIR}" +} + +do_libc() { + local arch="${CT_ARCH}" + if [ "${CT_ARCH_64}" = "y" ]; then + if [ "${CT_ARCH}" = "x86" ]; then + arch="${arch}_" + fi + arch="${arch}64" + fi + CT_DoStep INFO "Installing C library binaries" + CT_DoExecLog ALL cp -r "${CT_SRC_DIR}/android-ndk-${CT_LIBC_VERSION}/platforms/android-${CT_ANDROID_API}/arch-${arch}/usr" "${CT_SYSROOT_DIR}" + CT_EnvModify CT_TARGET_CFLAGS "${CT_TARGET_CFLAGS} -D__ANDROID_API__=${CT_ANDROID_API}" +} + +do_libc_post_cc() { + : +} + diff --git a/scripts/functions b/scripts/functions index 39ec2b4..c1b99ce 100644 --- a/scripts/functions +++ b/scripts/functions @@ -1368,6 +1368,7 @@ CT_DoBuildTargetTuple() { *glibc) CT_TARGET_SYS=gnu;; uClibc) CT_TARGET_SYS=uclibc;; musl) CT_TARGET_SYS=musl;; + bionic) CT_TARGET_SYS=android;; avr-libc) # avr-libc only seems to work with the non-canonical "avr" target. 
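To make the copy steps in bionic.sh above easier to follow, here is a rough trace, not part of the patch, of what do_libc_start_files and do_libc amount to for a hypothetical AArch64 build (CT_ARCH=arm with CT_ARCH_64=y) against NDK r14b and ANDROID_API 21; the version and API values are illustrative only:

# Hypothetical expansion for CT_LIBC_VERSION=r14b, CT_ANDROID_API=21,
# CT_ARCH=arm, CT_ARCH_64=y  ->  NDK arch directory "arch-arm64"
cp -r "${CT_SRC_DIR}/android-ndk-r14b/sysroot/usr" "${CT_SYSROOT_DIR}"        # unified headers
cp -r "${CT_SRC_DIR}/android-ndk-r14b/platforms/android-21/arch-arm64/usr" \
      "${CT_SYSROOT_DIR}"                                                     # prebuilt Bionic libs and crt objects
# ...after which CT_TARGET_CFLAGS carries -D__ANDROID_API__=21 for all later build steps.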
CT_TARGET_SKIP_CONFIG_SUB=y -- cgit v0.10.2-6-g49f6 From a6e11562c46d9228ad70a84f7f0031b342078944 Mon Sep 17 00:00:00 2001 From: hyc Date: Wed, 26 Apr 2017 03:04:07 +0100 Subject: Tweaks for bionic/TLS diff --git a/config/cc/gcc.in.2 b/config/cc/gcc.in.2 index c147f05..963cb2a 100644 --- a/config/cc/gcc.in.2 +++ b/config/cc/gcc.in.2 @@ -94,6 +94,7 @@ config CC_GCC_SYSTEM_ZLIB config CC_GCC_CONFIG_TLS tristate prompt "Configure TLS (Thread Local Storage)" + depends on !LIBC_bionic default m help Specify that the target supports TLS (Thread Local Storage). Usually diff --git a/config/libc/bionic.in b/config/libc/bionic.in index 20697ad..b6b7af5 100644 --- a/config/libc/bionic.in +++ b/config/libc/bionic.in @@ -5,7 +5,9 @@ ## ## select LIBC_SUPPORT_THREADS_POSIX ## -## help Bionic is the Android C library. It is prebuilt. +## help Bionic is the Android C library. It is prebuilt, extracted from the Android NDK. +## help This platform has no TLS (Thread Local Storage) support so that option must be +## help disabled in the Compiler options. config THREADS default "posix" -- cgit v0.10.2-6-g49f6 From 925d0490dba077672ee3da2c97337b2ef75dd33e Mon Sep 17 00:00:00 2001 From: hyc Date: Wed, 26 Apr 2017 03:04:29 +0100 Subject: Fix for ctype change in NDK 14 diff --git a/patches/gcc/linaro-6.3-2017.02/951-bionic-ndk.patch b/patches/gcc/linaro-6.3-2017.02/951-bionic-ndk.patch new file mode 100644 index 0000000..59c50a8 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/951-bionic-ndk.patch @@ -0,0 +1,58 @@ +commit d38d37bdfe24b7ce1bdcb55642fb6b904718e68f +Author: Howard Chu +Date: Tue Apr 25 19:02:18 2017 -0700 + + Fix ctype for newer NDK headers + +diff --git a/libstdc++-v3/config/os/bionic/ctype_base.h b/libstdc++-v3/config/os/bionic/ctype_base.h +index 33978f3..c36e63c 100644 +--- a/libstdc++-v3/config/os/bionic/ctype_base.h ++++ b/libstdc++-v3/config/os/bionic/ctype_base.h +@@ -28,6 +28,18 @@ + + // Information as gleaned from /usr/include/ctype.h + ++// _CTYPE prefix was added in NDK r14 unified headers ++#ifndef _CTYPE_U ++#define _CTYPE_U _U ++#define _CTYPE_L _L ++#define _CTYPE_D _N ++#define _CTYPE_S _S ++#define _CTYPE_P _P ++#define _CTYPE_C _C ++#define _CTYPE_X _X ++#define _CTYPE_B _B ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -41,17 +53,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + // NB: Offsets into ctype::_M_table force a particular size + // on the mask type. Because of this, we don't use an enum. 
+ typedef char mask; +- static const mask upper = _U; +- static const mask lower = _L; +- static const mask alpha = _U | _L; +- static const mask digit = _N; +- static const mask xdigit = _X | _N; +- static const mask space = _S; +- static const mask print = _P | _U | _L | _N | _B; +- static const mask graph = _P | _U | _L | _N; +- static const mask cntrl = _C; +- static const mask punct = _P; +- static const mask alnum = _U | _L | _N; ++ static const mask upper = _CTYPE_U; ++ static const mask lower = _CTYPE_L; ++ static const mask alpha = _CTYPE_U | _CTYPE_L; ++ static const mask digit = _CTYPE_D; ++ static const mask xdigit = _CTYPE_X | _CTYPE_D; ++ static const mask space = _CTYPE_S; ++ static const mask print = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D | _CTYPE_B; ++ static const mask graph = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D; ++ static const mask cntrl = _CTYPE_C; ++ static const mask punct = _CTYPE_P; ++ static const mask alnum = _CTYPE_U | _CTYPE_L | _CTYPE_D; + #if __cplusplus >= 201103L + static const mask blank = space; + #endif -- cgit v0.10.2-6-g49f6 From 9062810a11b1d1a347c16e7cfd4cac31ed9427b3 Mon Sep 17 00:00:00 2001 From: hyc Date: Wed, 26 Apr 2017 04:11:57 +0100 Subject: Fix type mismatch in std::swap() args diff --git a/patches/gcc/linaro-6.3-2017.02/952-bionic-errno.patch b/patches/gcc/linaro-6.3-2017.02/952-bionic-errno.patch new file mode 100644 index 0000000..91f6ca3 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/952-bionic-errno.patch @@ -0,0 +1,19 @@ +commit 6cd4ad106ef87a3c58b0c3478e78409b47000de0 +Author: Howard Chu +Date: Tue Apr 25 20:17:03 2017 -0700 + + Fix, errno is volatile int + +diff --git a/libstdc++-v3/src/filesystem/dir.cc b/libstdc++-v3/src/filesystem/dir.cc +index 6ff12d0..5bbd664 100644 +--- a/libstdc++-v3/src/filesystem/dir.cc ++++ b/libstdc++-v3/src/filesystem/dir.cc +@@ -147,7 +147,7 @@ fs::_Dir::advance(error_code* ec, directory_options options) + + int err = std::exchange(errno, 0); + const auto entp = readdir(dirp); +- std::swap(errno, err); ++ std::swap((int&)errno, err); + + if (entp) + { -- cgit v0.10.2-6-g49f6 From a776367937aa4f6ff03f0d3fdecb90d13fdaab14 Mon Sep 17 00:00:00 2001 From: hyc Date: Wed, 26 Apr 2017 09:23:07 +0100 Subject: Add sample Android config, gcc 6.3 patches diff --git a/patches/gcc/6.3.0/950-bionic-android.patch b/patches/gcc/6.3.0/950-bionic-android.patch new file mode 100644 index 0000000..1ae1186 --- /dev/null +++ b/patches/gcc/6.3.0/950-bionic-android.patch @@ -0,0 +1,17570 @@ +Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master + +Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with +Crystax-specific lib support omitted +diff --git a/gcc/config.gcc b/gcc/config.gcc +index f66e48c..4380657 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -374,6 +374,7 @@ i[34567]86-*-*) + rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h + adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h + avx512cdintrin.h avx512erintrin.h avx512pfintrin.h ++ arm_neon.h + shaintrin.h clflushoptintrin.h xsavecintrin.h + xsavesintrin.h avx512dqintrin.h avx512bwintrin.h + avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h +@@ -396,6 +397,7 @@ x86_64-*-*) + rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h + adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h + avx512cdintrin.h avx512erintrin.h avx512pfintrin.h ++ arm_neon.h + shaintrin.h clflushoptintrin.h xsavecintrin.h + xsavesintrin.h avx512dqintrin.h avx512bwintrin.h + avx512vlintrin.h avx512vlbwintrin.h 
avx512vldqintrin.h +@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) + TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` + ;; + aarch64*-*-linux*) +- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" ++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ extra_options="${extra_options} linux-android.opt" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + case $target in + aarch64_be-*) + tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" + ;; ++ aarch64*-*-linux-android*) ++ tm_file="${tm_file} aarch64/aarch64-linux-android.h" ++ ;; + esac + aarch64_multilibs="${with_multilib_list}" + if test "$aarch64_multilibs" = "default"; then +@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian. + tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" + extra_options="${extra_options} linux-android.opt" + case ${target} in ++ mips64*android*) ++ default_mips_arch=mips64r6 ++ default_mips_abi=64 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="${tmake_file} mips/t-linux-android64" ++ ;; ++ mips*android*) ++ default_mips_arch=mips32 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="$tmake_file mips/t-linux-android" ++ ;; + mipsisa32r6*) + default_mips_arch=mips32r6 + ;; +diff --git a/gcc/config.in b/gcc/config.in +index 115cb61..9339168 100644 +--- a/gcc/config.in ++++ b/gcc/config.in +@@ -2119,6 +2119,12 @@ + #endif + + ++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */ ++#ifndef USED_FOR_TARGET ++#undef USE_EH_FRAME_HDR_FOR_STATIC ++#endif ++ ++ + /* Define to 1 if the 'long long' type is wider than 'long' but still + efficiently supported by the host hardware. */ + #ifndef USED_FOR_TARGET +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +new file mode 100644 +index 0000000..cffd84d +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -0,0 +1,63 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef GCC_AARCH64_LINUX_ANDROID_H ++#define GCC_AARCH64_LINUX_ANDROID_H ++ ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ ++ } \ ++ while (0) ++ ++#undef LINK_SPEC ++#define LINK_SPEC \ ++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ ++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) ++ ++#undef CC1_SPEC ++#define CC1_SPEC \ ++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) ++ ++#undef LIB_SPEC ++#define LIB_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ ++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) ++ ++#undef STARTFILE_SPEC ++#define STARTFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) ++ ++#undef ENDFILE_SPEC ++#define ENDFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif ++ ++#endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5fcaa59..6864195 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -21,7 +21,14 @@ + #ifndef GCC_AARCH64_LINUX_H + #define GCC_AARCH64_LINUX_H + +-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifndef RUNTIME_ROOT_PREFIX ++#define RUNTIME_ROOT_PREFIX "" ++#endif ++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifdef BIONIC_DYNAMIC_LINKER ++#undef BIONIC_DYNAMIC_LINKER ++#endif ++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" + + #undef MUSL_DYNAMIC_LINKER + #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" +diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h +index 093c38b..54b3e0c 100644 +--- a/gcc/config/alpha/elf.h ++++ b/gcc/config/alpha/elf.h +@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; + I imagine that other systems will catch up. In the meantime, it + doesn't harm to make sure that the data exists to be used later. */ + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index 5974c65..71b2c7a 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) + memsize = MEM_SIZE (x); + + /* Only certain alignment specifiers are supported by the hardware. */ +- if (memsize == 32 && (align % 32) == 0) ++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC ++ honors stricter alignment of composite type in user code, it doesn't ++ observe the alignment of memory passed as an extra argument for function ++ returning large composite type. 
See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ ++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) + align_bits = 256; +- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) ++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) + align_bits = 128; + else if (memsize >= 8 && (align % 8) == 0) + align_bits = 64; +diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h +index ad123dd..97b059d 100644 +--- a/gcc/config/arm/arm.h ++++ b/gcc/config/arm/arm.h +@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes + + #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ ++ && !inline_thumb1_jump_table \ + && (optimize_size || flag_pic))) + + #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ +- (TARGET_THUMB1 \ ++ (TARGET_THUMB1 && !inline_thumb1_jump_table \ + ? (min >= 0 && max < 512 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 47171b9..eb22d11 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -8179,7 +8179,7 @@ + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label +- "TARGET_32BIT || optimize_size || flag_pic" ++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" + " + { + enum insn_code code; +diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt +index 0ebe017..7724538 100644 +--- a/gcc/config/arm/arm.opt ++++ b/gcc/config/arm/arm.opt +@@ -193,6 +193,10 @@ mthumb-interwork + Target Report Mask(INTERWORK) + Support calls between Thumb and ARM instruction sets. + ++minline-thumb1-jumptable ++Target Report Var(inline_thumb1_jump_table) ++Inline Thumb1 Jump table code ++ + mtls-dialect= + Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) + Specify thread local storage scheme. +diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h +index 77f3055..32158ed 100644 +--- a/gcc/config/arm/elf.h ++++ b/gcc/config/arm/elf.h +@@ -56,8 +56,7 @@ + #undef SUBSUBTARGET_EXTRA_SPECS + #define SUBSUBTARGET_EXTRA_SPECS + +-#ifndef ASM_SPEC +-#define ASM_SPEC "\ ++#define DEFAULT_ASM_SPEC "\ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ + %(asm_cpu_spec) \ +@@ -66,6 +65,9 @@ + %{mthumb-interwork:-mthumb-interwork} \ + %{mfloat-abi=*} %{mfpu=*} \ + %(subtarget_extra_asm_spec)" ++ ++#ifndef ASM_SPEC ++#define ASM_SPEC DEFAULT_ASM_SPEC + #endif + + /* The ARM uses @ are a comment character so we need to redefine +@@ -104,7 +106,8 @@ + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. 
*/ + #define JUMP_TABLES_IN_TEXT_SECTION \ +- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) ++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ ++ && (optimize_size || flag_pic))) + + #ifndef LINK_SPEC + #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h +index ace8481..ed7f4d0 100644 +--- a/gcc/config/arm/linux-eabi.h ++++ b/gcc/config/arm/linux-eabi.h +@@ -108,11 +108,16 @@ + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ +- ANDROID_CC1_SPEC) ++ ANDROID_CC1_SPEC("-fpic")) + + #define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + ++#undef ASM_SPEC ++#define ASM_SPEC \ ++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ ++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) ++ + #undef LIB_SPEC + #define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ +diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi +index 8f1307c..cbbec5b 100644 +--- a/gcc/config/arm/t-linux-androideabi ++++ b/gcc/config/arm/t-linux-androideabi +@@ -1,8 +1,9 @@ +-MULTILIB_OPTIONS = march=armv7-a mthumb +-MULTILIB_DIRNAMES = armv7-a thumb +-MULTILIB_EXCEPTIONS = ++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard ++MULTILIB_DIRNAMES = armv7-a thumb hard ++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* + MULTILIB_MATCHES = + MULTILIB_OSDIRNAMES = ++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch + + # The "special" multilib can be used to build native applications for Android, + # as opposed to native shared libraries that are then called via JNI. +diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h +index 5ded869..5f51ac8 100644 +--- a/gcc/config/freebsd.h ++++ b/gcc/config/freebsd.h +@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see + #define LIB_SPEC FBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #ifdef TARGET_LIBC_PROVIDES_SSP + #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ +diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h +index b0bf40a..d1874bc 100644 +--- a/gcc/config/gnu-user.h ++++ b/gcc/config/gnu-user.h +@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LINK_GCC_C_SEQUENCE_SPEC + #define LINK_GCC_C_SEQUENCE_SPEC \ +diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h +new file mode 100644 +index 0000000..10944f3 +--- /dev/null ++++ b/gcc/config/i386/arm_neon.h +@@ -0,0 +1,16622 @@ ++//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com ++ ++//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved. ++ ++//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. ++ ++//By downloading, copying, installing or using the software you agree to this license. ++//If you do not agree to this license, do not download, install, copy or use the software. 
++ ++// License Agreement ++ ++//Permission to use, copy, modify, and/or distribute this software for any ++//purpose with or without fee is hereby granted, provided that the above ++//copyright notice and this permission notice appear in all copies. ++ ++//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH ++//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY ++//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, ++//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM ++//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR ++//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR ++//PERFORMANCE OF THIS SOFTWARE. ++ ++//***************************************************************************************** ++// This file is intended to simplify ARM->IA32 porting ++// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h") ++// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below ++// MMX instruction set is not used due to performance overhead and the necessity to use the ++// EMMS instruction (_mm_empty())for mmx-x87 floating point switching ++//***************************************************************************************** ++ ++//!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual. ++//!!!!!!! Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for ++//!!!!!!! greater performance. It can be done by -msse4.2 compiler switch. ++ ++#ifndef NEON2SSE_H ++#define NEON2SSE_H ++ ++#ifndef USE_SSE4 ++#if defined(__SSE4_2__) ++ #define USE_SSE4 ++#endif ++#endif ++ ++#include //SSE ++#include //SSE2 ++#include //SSE3 ++#include //SSSE3 ++#ifdef USE_SSE4 ++#include //SSE4.1 ++#include //SSE4.2 ++#endif ++ ++ ++//*************** functions and data attributes, compiler dependent ********************************* ++//*********************************************************************************** ++#ifdef __GNUC__ ++#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) ++#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16))) ++#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) ++#if _GCC_VERSION < 40500 ++ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function ++#else ++ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function ++#endif ++#if defined(__x86_64__) ++ #define _NEON2SSE_64BIT __x86_64__ ++#endif ++#else ++#define _NEON2SSE_ALIGN_16 __declspec(align(16)) ++#define _NEON2SSE_INLINE __inline ++#if defined(_MSC_VER)|| defined (__INTEL_COMPILER) ++ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function ++#if defined(_M_X64) ++ #define _NEON2SSE_64BIT _M_X64 ++#endif ++#else ++ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function ++#endif ++#endif ++ ++#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4) ++ #define _NEON2SSE_64BIT_SSE4 ++#endif ++ ++/*********************************************************************************************************************/ ++// data types conversion ++/*********************************************************************************************************************/ ++#if 
defined(_MSC_VER) && (_MSC_VER < 1300) ++ typedef signed char int8_t; ++ typedef unsigned char uint8_t; ++ typedef signed short int16_t; ++ typedef unsigned short uint16_t; ++ typedef signed int int32_t; ++ typedef unsigned int uint32_t; ++ typedef signed long long int64_t; ++ typedef unsigned long long uint64_t; ++#elif defined(_MSC_VER) ++ typedef signed __int8 int8_t; ++ typedef unsigned __int8 uint8_t; ++ typedef signed __int16 int16_t; ++ typedef unsigned __int16 uint16_t; ++ typedef signed __int32 int32_t; ++ typedef unsigned __int32 uint32_t; ++ ++ typedef signed long long int64_t; ++ typedef unsigned long long uint64_t; ++#else ++#include ++#include ++#endif ++ ++typedef union __m64_128 { ++ uint64_t m64_u64[1]; ++ float m64_f32[2]; ++ int8_t m64_i8[8]; ++ int16_t m64_i16[4]; ++ int32_t m64_i32[2]; ++ int64_t m64_i64[1]; ++ uint8_t m64_u8[8]; ++ uint16_t m64_u16[4]; ++ uint32_t m64_u32[2]; ++} __m64_128; ++ ++typedef __m64_128 int8x8_t; ++typedef __m64_128 uint8x8_t; ++typedef __m64_128 int16x4_t; ++typedef __m64_128 uint16x4_t; ++typedef __m64_128 int32x2_t; ++typedef __m64_128 uint32x2_t; ++typedef __m64_128 int64x1_t; ++typedef __m64_128 uint64x1_t; ++typedef __m64_128 poly8x8_t; ++typedef __m64_128 poly16x4_t; ++ ++typedef __m64_128 float32x2_t; ++typedef __m128 float32x4_t; ++ ++typedef __m128 float16x4_t; //not supported by IA, for compatibility ++typedef __m128 float16x8_t; //not supported by IA, for compatibility ++ ++typedef __m128i int8x16_t; ++typedef __m128i int16x8_t; ++typedef __m128i int32x4_t; ++typedef __m128i int64x2_t; ++typedef __m128i uint8x16_t; ++typedef __m128i uint16x8_t; ++typedef __m128i uint32x4_t; ++typedef __m128i uint64x2_t; ++typedef __m128i poly8x16_t; ++typedef __m128i poly16x8_t; ++ ++#if defined(_MSC_VER) ++ #define SINT_MIN (-2147483647 - 1) /* min signed int value */ ++ #define SINT_MAX 2147483647 /* max signed int value */ ++#else ++ #define SINT_MIN INT_MIN /* min signed int value */ ++ #define SINT_MAX INT_MAX /* max signed int value */ ++#endif ++ ++typedef float float32_t; ++#if !defined(__clang__) ++typedef float __fp16; ++#endif ++ ++typedef uint8_t poly8_t; ++typedef uint16_t poly16_t; ++ ++ ++//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in ++//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types ++struct int8x16x2_t { ++ int8x16_t val[2]; ++}; ++struct int16x8x2_t { ++ int16x8_t val[2]; ++}; ++struct int32x4x2_t { ++ int32x4_t val[2]; ++}; ++struct int64x2x2_t { ++ int64x2_t val[2]; ++}; ++//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!! 
++struct int8x8x2_t { ++ int8x8_t val[2]; ++}; ++struct int16x4x2_t { ++ int16x4_t val[2]; ++}; ++struct int32x2x2_t { ++ int32x2_t val[2]; ++}; ++struct int64x1x2_t { ++ int64x1_t val[2]; ++}; ++ ++typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy ++typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy ++typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy ++typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy ++ ++typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy ++typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy ++typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy ++typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */ ++typedef struct int8x16x2_t uint8x16x2_t; ++typedef struct int16x8x2_t uint16x8x2_t; ++typedef struct int32x4x2_t uint32x4x2_t; ++typedef struct int64x2x2_t uint64x2x2_t; ++typedef struct int8x16x2_t poly8x16x2_t; ++typedef struct int16x8x2_t poly16x8x2_t; ++ ++typedef struct int8x8x2_t uint8x8x2_t; ++typedef struct int16x4x2_t uint16x4x2_t; ++typedef struct int32x2x2_t uint32x2x2_t; ++typedef struct int64x1x2_t uint64x1x2_t; ++typedef struct int8x8x2_t poly8x8x2_t; ++typedef struct int16x4x2_t poly16x4x2_t; ++ ++//float ++struct float32x4x2_t { ++ float32x4_t val[2]; ++}; ++struct float16x8x2_t { ++ float16x8_t val[2]; ++}; ++struct float32x2x2_t { ++ float32x2_t val[2]; ++}; ++ ++typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy ++typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy ++typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy ++typedef float16x8x2_t float16x4x2_t; ++ ++//4 ++struct int8x16x4_t { ++ int8x16_t val[4]; ++}; ++struct int16x8x4_t { ++ int16x8_t val[4]; ++}; ++struct int32x4x4_t { ++ int32x4_t val[4]; ++}; ++struct int64x2x4_t { ++ int64x2_t val[4]; ++}; ++ ++struct int8x8x4_t { ++ int8x8_t val[4]; ++}; ++struct int16x4x4_t { ++ int16x4_t val[4]; ++}; ++struct int32x2x4_t { ++ int32x2_t val[4]; ++}; ++struct int64x1x4_t { ++ int64x1_t val[4]; ++}; ++ ++typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy ++typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy ++typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy ++typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy ++ ++typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy ++typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy ++typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy ++typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ ++typedef struct int8x8x4_t uint8x8x4_t; ++typedef struct int16x4x4_t uint16x4x4_t; ++typedef struct int32x2x4_t uint32x2x4_t; ++typedef struct int64x1x4_t uint64x1x4_t; ++typedef struct int8x8x4_t poly8x8x4_t; ++typedef struct int16x4x4_t poly16x4x4_t; ++ ++typedef struct int8x16x4_t uint8x16x4_t; ++typedef struct int16x8x4_t uint16x8x4_t; ++typedef struct int32x4x4_t uint32x4x4_t; ++typedef 
struct int64x2x4_t uint64x2x4_t; ++typedef struct int8x16x4_t poly8x16x4_t; ++typedef struct int16x8x4_t poly16x8x4_t; ++ ++struct float32x4x4_t { ++ float32x4_t val[4]; ++}; ++struct float16x8x4_t { ++ float16x8_t val[4]; ++}; ++struct float32x2x4_t { ++ float32x2_t val[4]; ++}; ++ ++typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy ++typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy ++typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy ++typedef float16x8x4_t float16x4x4_t; ++ ++//3 ++struct int16x8x3_t { ++ int16x8_t val[3]; ++}; ++struct int32x4x3_t { ++ int32x4_t val[3]; ++}; ++struct int64x2x3_t { ++ int64x2_t val[3]; ++}; ++struct int8x16x3_t { ++ int8x16_t val[3]; ++}; ++ ++struct int16x4x3_t { ++ int16x4_t val[3]; ++}; ++struct int32x2x3_t { ++ int32x2_t val[3]; ++}; ++struct int64x1x3_t { ++ int64x1_t val[3]; ++}; ++struct int8x8x3_t { ++ int8x8_t val[3]; ++}; ++typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy ++typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy ++typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy ++typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy ++ ++typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy ++typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy ++typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy ++typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy ++ ++ ++/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ ++typedef struct int8x16x3_t uint8x16x3_t; ++typedef struct int16x8x3_t uint16x8x3_t; ++typedef struct int32x4x3_t uint32x4x3_t; ++typedef struct int64x2x3_t uint64x2x3_t; ++typedef struct int8x16x3_t poly8x16x3_t; ++typedef struct int16x8x3_t poly16x8x3_t; ++typedef struct int8x8x3_t uint8x8x3_t; ++typedef struct int16x4x3_t uint16x4x3_t; ++typedef struct int32x2x3_t uint32x2x3_t; ++typedef struct int64x1x3_t uint64x1x3_t; ++typedef struct int8x8x3_t poly8x8x3_t; ++typedef struct int16x4x3_t poly16x4x3_t; ++ ++//float ++struct float32x4x3_t { ++ float32x4_t val[3]; ++}; ++struct float32x2x3_t { ++ float32x2_t val[3]; ++}; ++struct float16x8x3_t { ++ float16x8_t val[3]; ++}; ++ ++typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy ++typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy ++typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy ++typedef float16x8x3_t float16x4x3_t; ++ ++ ++//**************************************************************************** ++//****** Porting auxiliary macros ******************************************** ++ ++//** floating point related macros ** ++#define _M128i(a) _mm_castps_si128(a) ++#define _M128(a) _mm_castsi128_ps(a) ++//here the most performance effective implementation is compiler and 32/64 bits build dependent ++#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) ) ++ ++ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a))) ++ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp); ++ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp)); ++#else ++ //for 32bit gcc and Microsoft compilers builds ++ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a)) ++ 
#define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp) ++ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp)) ++#endif ++#define _pM128(a) _mm_castsi128_ps(_pM128i(a)) ++ ++#define return64(a) _M64(res64,a); return res64; ++#define return64f(a) _M64f(res64,a); return res64; ++ ++#define _Ui64(a) (*(uint64_t*)&(a)) ++#define _UNSIGNED_T(a) u ## a ++ ++#define _SIGNBIT64 ((uint64_t)1 << 63) ++#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6)) ++#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) ) ++ ++#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it" ++#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it" ++ ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++#define __constrange(min,max) const ++#define __transfersize(size) ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ ++//************************************************************************* ++//************************************************************************* ++//********* Functions declarations as declared in original arm_neon.h ***** ++//************************************************************************* ++//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes. ++int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 ++int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 ++int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 ++int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 ++float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 ++uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 ++uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 ++uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 ++uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 ++int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 ++int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 ++int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 ++int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 ++float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 ++uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 ++uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 ++uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 ++uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 ++//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
++int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 ++int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 ++int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 ++uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 ++uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0 ++uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 ++//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 ++int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 ++int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 ++uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 ++uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0 ++uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 ++//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ++int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 ++int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 ++int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 ++uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0 ++uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0 ++uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 ++int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 ++int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0 ++int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 ++uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 ++uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0 ++uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 ++//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1 ++int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 ++int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 ++int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 ++uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 ++uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0 ++uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 ++int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 ++int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 ++int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 ++uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 ++uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0 ++uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 ++//Vector saturating add: vqadd -> Vr[i]:=sat(Va[i]+Vb[i]) ++int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 ++int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 ++int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 ++int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 ++uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 ++uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0 ++uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 ++uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 ++int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 ++int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 
++int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 ++int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 ++uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 ++uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0 ++uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 ++uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 ++//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i] ++int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 ++int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 ++int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 ++uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 ++uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 ++uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 ++//Vector rounding add high half: vraddhn ++int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 ++int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 ++int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 ++uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 ++uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 ++uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 ++//Multiplication ++//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] ++int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 ++int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 ++int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 ++float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 ++uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 ++uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 ++uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 ++poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 ++int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 ++int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 ++int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 ++float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 ++uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 ++uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 ++uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 ++poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 ++//multiply lane ++int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); ++int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); ++float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); ++uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); ++uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); ++int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c); ++int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); ++float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); ++uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); ++uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); ++//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] 
++int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 ++int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 ++int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 ++float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 ++uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 ++uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 ++uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 ++int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 ++int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 ++int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 ++float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 ++uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 ++uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 ++uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 ++//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] ++int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 ++int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 ++int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 ++uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 ++uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0 ++uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 ++//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ++int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 ++int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 ++int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 ++float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 ++uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 ++uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 ++uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 ++int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 ++int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 ++int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 ++float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 ++uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 ++uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 ++uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 ++//Vector multiply subtract long ++int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 ++int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 ++int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 ++uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 ++uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0 ++uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 
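As a usage note rather than part of the header itself: source written against the NEON intrinsics declared here compiles unchanged for x86 once this arm_neon.h shim is in the include path; only an SSE-capable target is needed, and -msse4.2 additionally enables the shim's USE_SSE4 path. A hypothetical smoke test, with the compiler name as a placeholder:

# neon_smoke.c (hypothetical) uses only intrinsics declared in this header:
#   #include <arm_neon.h>
#   float32x4_t scale(float32x4_t v, float32x4_t w)
#   { return vmulq_f32(vaddq_f32(v, v), w); }
# The shim implements these on top of __m128/SSE, so a plain x86 compile suffices:
i686-linux-android-gcc -msse4.2 -O2 -c neon_smoke.c   # compiler name is a placeholder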
++//Vector saturating doubling multiply high ++int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 ++int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 ++int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 ++int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 ++//Vector saturating rounding doubling multiply high ++int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 ++int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 ++int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 ++int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 ++//Vector saturating doubling multiply accumulate long ++int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 ++int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 ++//Vector saturating doubling multiply subtract long ++int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 ++int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 ++//Vector long multiply ++int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 ++int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 ++int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 ++uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 ++uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0 ++uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 ++poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 ++//Vector saturating doubling long multiply ++int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 ++int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 ++//Subtraction ++//Vector subtract ++int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 ++int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 ++int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 ++int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 ++float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 ++uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 ++uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 ++uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 ++uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 ++int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 ++int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 ++int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 ++int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 ++float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 ++uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 ++uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 ++uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 ++uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 ++//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 ++int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 ++int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 ++uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 ++uint32x4_t 
vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0 ++uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 ++//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i] ++int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0 ++int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 ++int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 ++uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 ++uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0 ++uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 ++//Vector saturating subtract ++int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 ++int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 ++int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 ++int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 ++uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 ++uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0 ++uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 ++uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 ++int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 ++int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 ++int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 ++int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 ++uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 ++uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0 ++uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 ++uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0 ++//Vector halving subtract ++int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 ++int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 ++int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 ++uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 ++uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0 ++uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 ++int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 ++int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 ++int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 ++uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 ++uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0 ++uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 ++//Vector subtract high half ++int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 ++int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 ++int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 ++uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 ++uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 ++uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 ++//Vector rounding subtract high half ++int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++uint16x4_t vrsubhn_u32(uint32x4_t a, 
uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++//Comparison ++//Vector compare equal ++uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 ++uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 ++uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 ++uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 ++uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 ++uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 ++uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 ++uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 ++uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 ++uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 ++uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 ++uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 ++uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 ++uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 ++//Vector compare greater-than or equal ++uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 ++uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 ++uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++//Vector compare less-than or equal ++uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 ++uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 ++uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++//Vector compare greater-than ++uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++uint16x4_t 
vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 ++uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 ++uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++//Vector compare less-than ++uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 ++uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 ++uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++//Vector compare absolute greater-than or equal ++uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++//Vector compare absolute less-than or equal ++uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++//Vector compare absolute greater-than ++uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++//Vector compare absolute less-than ++uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++//Vector test bits ++uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 ++uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 ++uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 ++uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 ++uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 ++uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 ++uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 ++uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 ++uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 ++uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 ++uint8x16_t vtstq_u8(uint8x16_t 
a, uint8x16_t b); // VTST.8 q0, q0, q0 ++uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 ++uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 ++uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 ++//Absolute difference ++//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] | ++int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 ++int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 ++int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 ++uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 ++uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0 ++uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 ++float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 ++int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 ++int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 ++int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 ++uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 ++uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0 ++uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 ++float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 ++//Absolute difference - long ++int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 ++int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 ++int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 ++uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 ++uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0 ++uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 ++//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ++int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 ++int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 ++int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 ++uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 ++uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0 ++uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 ++int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 ++int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 ++int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 ++uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 ++uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0 ++uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 ++//Absolute difference and accumulate - long ++int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 ++int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 ++int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 ++uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 ++uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0 ++uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 ++//Max/Min ++//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ++int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 ++int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 ++int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 ++uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 ++uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0 ++uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 ++float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 ++int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 ++int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 ++int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 ++uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 ++uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 ++uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 ++float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 ++//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ++int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 ++int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 ++int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 ++uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 ++uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0 ++uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 ++float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 ++int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 ++int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 ++int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 ++uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 ++uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 ++uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 ++float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 ++//Pairwise addition ++//Pairwise add ++int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 ++int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 ++int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 ++uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 ++uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 ++uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 ++float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0 ++//Long pairwise add ++int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 ++int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 ++int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 ++uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 ++uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0 ++uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 ++int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 ++int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 ++int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 ++uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 ++uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0 ++uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 ++//Long pairwise add and accumulate ++int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 ++int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 ++int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 
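/* Illustrative sketch, not from the patch itself: the long pairwise-add
   intrinsics listed above (vpaddl_u8/vpaddl_u16/vpaddl_u32) are typically
   chained to reduce a vector to a scalar sum.  Assumes <arm_neon.h> is
   available (native or via this shim); vget_lane_u64 is declared elsewhere
   in the header, and sum_u8x8 is a made-up name for illustration. */
#include <arm_neon.h>
#include <stdint.h>

static uint32_t sum_u8x8(uint8x8_t v)
{
    uint16x4_t s16 = vpaddl_u8(v);     /* 8 x u8  -> 4 x u16 pair sums */
    uint32x2_t s32 = vpaddl_u16(s16);  /* 4 x u16 -> 2 x u32 pair sums */
    uint64x1_t s64 = vpaddl_u32(s32);  /* 2 x u32 -> 1 x u64 total     */
    return (uint32_t)vget_lane_u64(s64, 0);
}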
++uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 ++uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0 ++uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 ++int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 ++int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 ++int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 ++uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 ++uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 ++uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 ++//Folding maximum vpmax -> takes maximum of adjacent pairs ++int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 ++int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 ++int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 ++uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 ++uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0 ++uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 ++float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 ++//Folding minimum vpmin -> takes minimum of adjacent pairs ++int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 ++int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 ++int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 ++uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 ++uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0 ++uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 ++float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 ++//Reciprocal/Sqrt ++float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 ++float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 ++float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 ++float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 ++//Shifts by signed variable ++//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ++int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 ++int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 ++int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 ++int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 ++uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 ++uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0 ++uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 ++uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 ++int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 ++int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 ++int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 ++int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 ++uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 ++uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 ++uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 ++uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 ++//Vector saturating shift left: (negative values shift right) ++int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 ++int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 
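/* Illustrative sketch, not from the patch itself: vrecps_f32 above is the
   Newton-Raphson step (it returns 2 - a*b), normally paired with the
   vrecpe_f32 estimate declared elsewhere in this header to approximate a
   reciprocal.  approx_recip is a made-up name for illustration. */
#include <arm_neon.h>

static float32x2_t approx_recip(float32x2_t d)
{
    float32x2_t x = vrecpe_f32(d);       /* coarse initial estimate of 1/d */
    x = vmul_f32(x, vrecps_f32(d, x));   /* x *= (2 - d*x): first step     */
    x = vmul_f32(x, vrecps_f32(d, x));   /* second step, near full f32     */
    return x;
}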
++int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 ++int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 ++uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 ++uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0 ++uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 ++uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 ++int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 ++int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 ++int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 ++int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 ++uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 ++uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 ++uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 ++uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 ++//Vector rounding shift left: (negative values shift right) ++int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 ++int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 ++int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 ++int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 ++uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 ++uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0 ++uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 ++uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 ++int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 ++int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 ++int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 ++int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 ++uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 ++uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 ++uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 ++uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 ++//Vector saturating rounding shift left: (negative values shift right) ++int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 ++int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 ++int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 ++int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 ++uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 ++uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0 ++uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 ++uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 ++int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 ++int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 ++int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 ++int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 ++uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 ++uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 ++uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 ++uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 ++//Shifts by a constant ++//Vector shift right by constant ++int8x8_t 
vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 ++int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 ++int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 ++int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 ++uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 ++uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16 ++uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 ++uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 ++int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 ++int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 ++int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 ++int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 ++uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 ++uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 ++uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 ++uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 ++//Vector shift left by constant ++int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++//Vector rounding shift right by constant ++int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 ++int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 ++int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 ++int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 ++uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 ++uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16 ++uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 ++uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 ++int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 
q0,q0,#8 ++int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 ++int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 ++int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 ++uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 ++uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 ++uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 ++uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 ++//Vector shift right by constant and accumulate ++int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 ++int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 ++int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 ++int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 ++uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 ++uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16 ++uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 ++uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 ++int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 ++int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 ++int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 ++int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 ++uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 ++uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 ++uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 ++uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 ++//Vector rounding shift right by constant and accumulate ++int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 ++int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 ++int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 ++int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 ++uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 ++uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16 ++uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 ++uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 ++int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 ++int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 ++int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 ++int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 ++uint8x16_t vrsraq_n_u8(uint8x16_t a, 
uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 ++uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 ++uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 ++uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 ++//Vector saturating shift left by constant ++int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 ++int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 ++int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 ++int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 ++uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 ++uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0 ++uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 ++uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 ++int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 ++int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 ++int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 ++int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 ++uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 ++uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 ++uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 ++uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 ++//Vector signed->unsigned saturating shift left by constant ++uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 ++uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 ++uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 ++uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 ++uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 ++uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 ++uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 ++uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 ++//Vector narrowing shift right by constant ++int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++//Vector signed->unsigned narrowing saturating shift right by constant ++uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 ++uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 ++uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 ++//Vector signed->unsigned rounding narrowing saturating shift right 
by constant ++uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 ++uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 ++uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 ++//Vector narrowing saturating shift right by constant ++int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 ++int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 ++int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 ++uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8 ++uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 ++uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 ++//Vector rounding narrowing shift right by constant ++int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++//Vector rounding narrowing saturating shift right by constant ++int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 ++int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 ++int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 ++uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 ++uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 ++uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 ++//Vector widening shift left by constant ++int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 ++int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 ++int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 ++uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 ++uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0 ++uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 ++//Shifts with insert ++//Vector shift right and insert ++int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 
d0,d0,#8 ++poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++//Vector shift left and insert ++int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++//Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 
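/* Illustrative sketch, not from the patch itself: a minimal use of the
   plain vld1q/vst1q load/store forms declared below, scaling a float
   buffer in blocks of four lanes.  Assumes n is a multiple of 4;
   scale_f32 is a made-up name for illustration. */
#include <arm_neon.h>
#include <stddef.h>

static void scale_f32(float *dst, const float *src, size_t n, float k)
{
    float32x4_t vk = vdupq_n_f32(k);              /* broadcast the scalar */
    for (size_t i = 0; i < n; i += 4) {
        float32x4_t v = vld1q_f32(src + i);       /* VLD1.32 {d0,d1},[r0] */
        vst1q_f32(dst + i, vmulq_f32(v, vk));     /* VST1.32 {d0,d1},[r0] */
    }
}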
++//Load a single vector from memory ++uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] ++float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] ++uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] ++uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] ++uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] ++int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] ++int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] ++int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] ++float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] ++poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] ++poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] ++//Load a single lane from memory ++uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] ++int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0] ++float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] ++float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0] ++poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const 
* ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] ++poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] ++uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] ++uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] ++int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0] ++int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] ++float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] ++poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] ++poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] ++//Load all lanes of vector with same value from memory ++uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * 
ptr); // VLD1.32 {d0[]}, [r0] ++int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++//Store a single vector or lane. Stores all lanes or a single lane of a vector. ++//Store a single vector into memory ++void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] ++void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] ++void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] ++void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] ++void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] ++void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] ++void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] ++void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] ++void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] ++void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] ++void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] ++//Store a lane of a vector into memory ++//Loads of an N-element structure ++//Load N-element structure from memory ++uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] ++float32x4x2_t 
vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] ++float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++uint16x8x4_t 
vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++//Load all lanes of N-element structure with same value from memory ++uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t 
const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++//Load a single lane of N-element structure from memory ++//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned ++uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] ++float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] ++poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 
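/* Illustrative sketch, not from the patch itself: the N-element structure
   loads above de-interleave on the fly -- vld3q_u8 splits packed RGB bytes
   into three planes in one operation.  Assumes npixels is a multiple of 16;
   extract_green is a made-up name for illustration. */
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static void extract_green(uint8_t *g, const uint8_t *rgb, size_t npixels)
{
    for (size_t i = 0; i < npixels; i += 16) {
        uint8x16x3_t px = vld3q_u8(rgb + 3 * i);  /* VLD3.8 {d0,d2,d4},[r0] */
        vst1q_u8(g + i, px.val[1]);               /* keep only the G plane  */
    }
}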
++uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0] ++int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0] ++//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] ++poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); 
//VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++//Store N-element structure to memory 
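/* Illustrative usage sketch only -- not part of the patched header.  Per the
 * C2719 remark above, the *_lane_*_ptr loads take the previous register set by
 * pointer instead of by value.  Assumes <arm_neon.h> is included; the names
 * below are made up, and the lane index must be a compile-time constant. */
static inline uint8x8x2_t example_patch_lane0(uint8x8x2_t pair, const uint8_t *two_bytes)
{
    /* replace lane 0 of both halves of 'pair' with the two bytes at 'two_bytes' */
    return vld2_lane_u8_ptr(two_bytes, &pair, 0);
}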
++void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0] ++void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0] ++void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0] ++void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0] ++void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0] ++//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] ++void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] ++void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] ++void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] ++void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // 
VST3.32 {d0, d1, d2}, [r0] ++void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] ++void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] ++void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] ++void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] ++void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] ++void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] ++void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] ++void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] ++//Store a single lane of N-element structure to memory ++void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 
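/* Illustrative usage sketch only -- not part of the patched header.  The
 * structure stores mirror the structure loads: vst2_u8_ptr, declared above,
 * interleaves the two 8-byte vectors of a uint8x8x2_t back into 16 bytes of
 * memory.  Assumes <arm_neon.h> is included; the names below are made up. */
static inline void example_interleave2(uint8_t *dst /* 16 bytes */, uint8x8_t even, uint8x8_t odd)
{
    uint8x8x2_t pair;
    pair.val[0] = even;            /* written to dst[0], dst[2], dst[4], ... */
    pair.val[1] = odd;             /* written to dst[1], dst[3], dst[5], ... */
    vst2_u8_ptr(dst, &pair);
}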
++void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] ++void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] ++void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] ++void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] ++void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] ++void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] ++void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] ++void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] ++void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] ++void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] ++void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void 
vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] ++void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] ++void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // 
VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] ++//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. ++uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] ++uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] ++int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] ++int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] ++float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] ++uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] ++int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] ++int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] ++float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 
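/* Illustrative usage sketch for the vget_lane/vgetq_lane group above -- not
 * part of the patched header.  The lane argument is __constrange-qualified,
 * so it must be an integer constant expression.  Assumes <arm_neon.h> is
 * included; the function name below is made up. */
static inline uint32_t example_sum_lanes(uint32x4_t v)
{
    /* extract the four 32-bit lanes individually and add them in scalar code */
    return vgetq_lane_u32(v, 0) + vgetq_lane_u32(v, 1)
         + vgetq_lane_u32(v, 2) + vgetq_lane_u32(v, 3);
}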
++uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++//Initialize a vector from a literal bit pattern. 
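/* Illustrative usage sketch for the vset_lane group above -- not part of the
 * patched header.  vset_lane_* returns a copy of the vector with one lane
 * replaced; the original operand is unchanged.  Assumes <arm_neon.h>. */
static inline uint16x4_t example_clear_lane3(uint16x4_t v)
{
    return vset_lane_u16(0, v, 3);   /* lane index must be a constant in 0..3 */
}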
++int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 ++int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 ++int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 ++float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 ++float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 ++uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 ++uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 ++uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 ++uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 ++poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 ++poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 ++int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 ++//Set all lanes to same value ++//Load all lanes of vector to the same literal value ++uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 ++uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 ++uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 ++int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 ++int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 ++int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 ++poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 ++poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 ++float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 ++uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 ++int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 ++int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 ++poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 ++int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 ++int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 ++uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 ++uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 ++int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 ++int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 ++int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 ++poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 ++poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 ++float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 ++uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 ++int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 ++int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 ++poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 ++int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 ++int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 ++uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++//Load all lanes of the vector to the value of a lane of a vector ++uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++uint32x2_t vdup_lane_u32(uint32x2_t vec, 
__constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. ++int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 ++int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 ++int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 ++int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 ++float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 ++float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 ++uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 ++uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 ++uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 ++poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 ++poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 ++//Splitting vectors. 
These intrinsics split a 128 bit vector into 2 component 64 bit vectors ++int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 ++int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 ++int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 ++int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 ++float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 ++float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 ++uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 ++uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 ++uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 ++uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 ++poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 ++poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 ++int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 ++int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 ++int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 ++int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 ++float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 ++float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 ++uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 ++uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 ++uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 ++uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 ++poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 ++poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 ++//Converting vectors. These intrinsics are used to convert vectors. ++//Convert from float ++int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 ++uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 ++int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 ++uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 ++int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 ++uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 ++int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 ++uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 ++//Convert to float ++float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 ++float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 ++float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 ++float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 ++float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 ++float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 ++float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 ++float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 ++//Convert between floats ++float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 ++float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 ++//Vector narrow integer ++int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 ++int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 ++int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 ++uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 ++uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 ++uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 ++//Vector long move ++int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 ++int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 ++int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 ++uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 
q0,d0 ++uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0 ++uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 ++//Vector saturating narrow integer ++int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 ++int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 ++int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 ++uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0 ++uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0 ++uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0 ++//Vector saturating narrow integer signed->unsigned ++uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 ++uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 ++uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 ++//Table look up ++uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 ++poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++//Extended table look up intrinsics ++uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 ++poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++//Operations with a scalar value ++//Vector multiply accumulate with scalar ++int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] ++int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] ++uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] ++uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] ++float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 
d0,d0, d0[0] ++int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0] ++int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0] ++uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0] ++uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0] ++float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0] ++//Vector widening multiply accumulate with scalar ++int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0] ++int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0] ++uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0] ++uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0] ++//Vector widening saturating doubling multiply accumulate with scalar ++int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0] ++int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0] ++//Vector multiply subtract with scalar ++int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] ++int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] ++uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] ++uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] ++float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0] ++int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0] ++int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0] ++uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0] ++uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0] ++float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0] ++//Vector widening multiply subtract with scalar ++int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0] ++int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0] ++uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0] ++uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0] ++//Vector widening saturating doubling multiply subtract with scalar ++int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0] ++int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0] ++//Vector 
multiply by scalar ++int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] ++int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] ++float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] ++uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] ++uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] ++int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] ++int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] ++float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] ++uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] ++uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] ++//Vector long multiply with scalar ++int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] ++int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] ++uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0] ++uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] ++//Vector long multiply by scalar ++int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] ++int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] ++uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0] ++uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] ++//Vector saturating doubling long multiply with scalar ++int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] ++int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] ++//Vector saturating doubling long multiply by scalar ++int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] ++int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] ++//Vector saturating doubling multiply high with scalar ++int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] ++int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] ++int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] ++int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] ++//Vector saturating doubling multiply high by scalar ++int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] ++int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] ++int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] ++int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] ++//Vector saturating rounding doubling multiply high with scalar ++int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] ++int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] ++int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] ++int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] ++//Vector rounding saturating doubling multiply high by scalar 
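/* Illustrative usage sketch for the multiply-by-scalar forms above -- not part
 * of the patched header.  The _n_ variants broadcast one scalar across all
 * lanes; the widening vmull_n_s16 produces 32-bit products, so 16-bit inputs
 * cannot overflow.  Assumes <arm_neon.h>; the names below are made up. */
static inline int32x4_t example_scale_widen(int16x4_t samples, int16_t gain)
{
    return vmull_n_s16(samples, gain);   /* four 16x16 -> 32-bit multiplies */
}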
++int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] ++int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] ++int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] ++int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] ++//Vector multiply accumulate with scalar ++int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] ++int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] ++uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] ++uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] ++float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] ++int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] ++int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] ++uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] ++uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] ++float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] ++//Vector widening multiply accumulate with scalar ++int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] ++int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] ++uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0] ++uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] ++//Vector widening saturating doubling multiply accumulate with scalar ++int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] ++int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] ++//Vector multiply subtract with scalar ++int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] ++int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] ++uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] ++uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] ++float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] ++int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] ++int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] ++uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] ++uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] ++float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] ++//Vector widening multiply subtract with scalar ++int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] ++int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] ++uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0] ++uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] ++//Vector widening saturating doubling multiply subtract with scalar ++int32x4_t 
vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] ++int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] ++//Vector extract ++int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 ++int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 ++uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 ++float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 ++int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 ++int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 ++int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 ++uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 ++int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 ++uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 ++float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0 ++//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 
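/* Illustrative usage sketch for the vext group above -- not part of the
 * patched header.  vext_u8(a, b, n) concatenates a and b and extracts eight
 * bytes starting at byte n, which is useful for sliding-window and FIR-style
 * kernels.  Assumes <arm_neon.h>; the names below are made up. */
static inline uint8x8_t example_shift_in(uint8x8_t window, uint8x8_t next)
{
    return vext_u8(window, next, 1);  /* drop window[0], append next[0] */
}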
++int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0 ++int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0 ++int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0 ++uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0 ++uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0 ++uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0 ++poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0 ++poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0 ++float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0 ++int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 ++int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 ++int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 ++uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 ++uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 ++uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 ++poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 ++poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 ++float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 ++int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0 ++int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0 ++uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0 ++uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0 ++poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0 ++poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0 ++int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 ++int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 ++uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 ++uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 ++poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 ++poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0 ++int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0 ++uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0 ++poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0 ++int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 ++uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 ++poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 ++//Other single operand arithmetic ++//Absolute: Vd[i] = |Va[i]| ++int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0 ++int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0 ++int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0 ++float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 ++int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 ++int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 ++int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 ++float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 ++//Saturating absolute: Vd[i] = sat(|Va[i]|) ++int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 ++int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 ++int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 ++int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 ++int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 ++int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 ++//Negate: Vd[i] = - Va[i] ++int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0 ++int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0 ++int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0 ++float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0 ++int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 ++int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 ++int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 ++float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 ++//Saturating Negate: sat(Vd[i] = - Va[i]) ++int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0 ++int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0 
++int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0 ++int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 ++int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 ++int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 ++//Count leading sign bits ++int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 ++int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 ++int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 ++int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 ++int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 ++int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 ++//Count leading zeros ++int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0 ++int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0 ++int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0 ++uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0 ++uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0 ++uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0 ++int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 ++int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 ++int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 ++uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 ++uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 ++uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 ++//Count number of set bits ++uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 ++int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 ++poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 ++uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 ++int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 ++poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 ++//Reciprocal estimate ++float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 ++uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 ++float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 ++uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 ++//Reciprocal square root estimate ++float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 ++uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 ++float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 ++uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 ++//Logical operations ++//Bitwise not ++int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 ++int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 ++int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 ++uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 ++uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 ++uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 ++poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 ++int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 ++int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 ++int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 ++uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 ++uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 ++uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 ++poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 ++//Bitwise and ++int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 ++int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 ++int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 ++int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 ++uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 ++uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 ++uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 ++uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 ++int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 ++int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 ++int32x4_t vandq_s32(int32x4_t a, int32x4_t 
b); // VAND q0,q0,q0 ++int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 ++uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 ++uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 ++uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 ++uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 ++//Bitwise or ++int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 ++int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 ++int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 ++int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 ++uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 ++uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 ++uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 ++uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 ++int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 ++int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 ++int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 ++int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 ++uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 ++uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 ++uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 ++uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 ++//Bitwise exclusive or (EOR or XOR) ++int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 ++int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 ++int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 ++int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 ++uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 ++uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 ++uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 ++uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 ++int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 ++int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 ++int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 ++int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 ++uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 ++uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 ++uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 ++uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 ++//Bit Clear ++int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 ++int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 ++int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 ++int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 ++uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 ++uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 ++uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 ++uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 ++int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 ++int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 ++int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 ++int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 ++uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 ++uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 ++uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 
++uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 ++//Bitwise OR complement ++int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 ++int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 ++int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 ++int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 ++uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 ++uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 ++uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 ++uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 ++int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 ++int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 ++int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 ++int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 ++uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 ++uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 ++uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 ++uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 ++//Bitwise Select ++int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 ++int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 ++int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 ++int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 ++uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 ++uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 ++uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 ++uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 ++float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 ++poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 ++poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 ++int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 ++int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 ++int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 ++int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 ++uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 ++uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 ++uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 ++uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 ++float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 ++poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 ++poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 ++//Transposition operations ++//Transpose elements ++int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 ++int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 ++int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 ++uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 ++uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 ++uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 ++float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 ++poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 
++poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 ++int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 ++int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 ++int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 ++uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 ++uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 ++uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 ++float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 ++poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 ++poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 ++//Interleave elements ++int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 ++int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 ++int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 ++uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 ++uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 ++uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 ++float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 ++poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 ++poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 ++int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 ++int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 ++int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 ++uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 ++uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 ++uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 ++float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 ++poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 ++poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 ++//De-Interleave elements ++int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 ++int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 ++int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 ++uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 ++uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 ++uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 ++float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 ++poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 ++poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 ++int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 ++int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 ++int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 ++uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 ++uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 ++uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 ++float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 ++poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 ++poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 ++ ++ ++//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
While for release build it is not a must, ++//for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal ++// ++#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers. ++ ++ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 ++ ++ #define _MM_EXTRACT_EPI16 _mm_extract_epi16 ++ #define _MM_INSERT_EPI16 _mm_insert_epi16 ++#ifdef USE_SSE4 ++ #define _MM_EXTRACT_EPI8 _mm_extract_epi8 ++ #define _MM_EXTRACT_EPI32 _mm_extract_epi32 ++ #define _MM_EXTRACT_PS _mm_extract_ps ++ ++ #define _MM_INSERT_EPI8 _mm_insert_epi8 ++ #define _MM_INSERT_EPI32 _mm_insert_epi32 ++ #define _MM_INSERT_PS _mm_insert_ps ++#ifdef _NEON2SSE_64BIT ++ #define _MM_INSERT_EPI64 _mm_insert_epi64 ++ #define _MM_EXTRACT_EPI64 _mm_extract_epi64 ++#endif ++#endif //SSE4 ++#else ++ #define _NEON2SSE_COMMA , ++ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ ++ switch(LANE) \ ++ { \ ++ case 0: return NAME(a b, 0); \ ++ case 1: return NAME(a b, 1); \ ++ case 2: return NAME(a b, 2); \ ++ case 3: return NAME(a b, 3); \ ++ case 4: return NAME(a b, 4); \ ++ case 5: return NAME(a b, 5); \ ++ case 6: return NAME(a b, 6); \ ++ case 7: return NAME(a b, 7); \ ++ case 8: return NAME(a b, 8); \ ++ case 9: return NAME(a b, 9); \ ++ case 10: return NAME(a b, 10); \ ++ case 11: return NAME(a b, 11); \ ++ case 12: return NAME(a b, 12); \ ++ case 13: return NAME(a b, 13); \ ++ case 14: return NAME(a b, 14); \ ++ case 15: return NAME(a b, 15); \ ++ default: return NAME(a b, 0); \ ++ } ++ ++ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ ++ switch(LANE) \ ++ { \ ++ case 0: return NAME(vec p,0); \ ++ case 1: return NAME(vec p,1); \ ++ case 2: return NAME(vec p,2); \ ++ case 3: return NAME(vec p,3); \ ++ case 4: return NAME(vec p,4); \ ++ case 5: return NAME(vec p,5); \ ++ case 6: return NAME(vec p,6); \ ++ case 7: return NAME(vec p,7); \ ++ default: return NAME(vec p,0); \ ++ } ++ ++ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ ++ switch(LANE) \ ++ { \ ++ case case0: return NAME(vec p,case0); \ ++ case case1: return NAME(vec p,case1); \ ++ case case2: return NAME(vec p,case2); \ ++ case case3: return NAME(vec p,case3); \ ++ default: return NAME(vec p,case0); \ ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) ++ { ++ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) ++ } ++ ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) ++ { ++ 
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) ++ } ++ ++#ifdef _NEON2SSE_64BIT ++ //the special case of functions available only for SSE4 and 64-bit build. ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) ++ { ++ switch(LANE) { ++ case 0: ++ return _mm_insert_epi64(vec, p, 0); ++ case 1: ++ return _mm_insert_epi64(vec, p, 1); ++ default: ++ return _mm_insert_epi64(vec, p, 0); ++ } ++ } ++ ++ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) ++ { ++ if (LANE ==0) return _mm_extract_epi64(val, 0); ++ else return _mm_extract_epi64(val, 1); ++ } ++#endif ++ ++ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) ++ { ++ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) ++ } ++ ++#endif //USE_SSE4 ++ ++#endif //#ifdef NDEBUG ++ ++//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices ++// or for some specific commonly used operations implementation missing in SSE ++#ifdef USE_SSE4 ++ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 ++ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 ++ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 ++ ++ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 ++ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 ++ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 ++ ++ #define _MM_MAX_EPI8 _mm_max_epi8 ++ #define _MM_MAX_EPI32 _mm_max_epi32 ++ #define _MM_MAX_EPU16 _mm_max_epu16 ++ #define _MM_MAX_EPU32 _mm_max_epu32 ++ ++ #define _MM_MIN_EPI8 _mm_min_epi8 ++ #define _MM_MIN_EPI32 _mm_min_epi32 ++ #define _MM_MIN_EPU16 _mm_min_epu16 ++ #define _MM_MIN_EPU32 _mm_min_epu32 ++ ++ #define _MM_BLENDV_EPI8 _mm_blendv_epi8 ++ #define _MM_PACKUS_EPI32 _mm_packus_epi32 ++ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) ++ ++ #define _MM_MULLO_EPI32 _mm_mullo_epi32 ++ #define _MM_MUL_EPI32 _mm_mul_epi32 ++ ++ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64 ++#else //no SSE4 !!!!!! 
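/* A short summary of the SSSE3 fallbacks defined below (they mirror the SSE4.1 intrinsics wrapped
   above): the widening converts are emulated with interleaves - zero-extension unpacks the low
   half with a zero vector, while sign-extension unpacks it with a per-lane sign mask built by
   _mm_cmpgt_epiN(zero, a); lane extract/insert go through aligned scratch arrays or and/andnot
   masking; and the max/min variants are built from a signed compare (biased by the sign bit for
   the unsigned cases) followed by and/andnot/or blending. */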
++ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi8(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi16(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ return _mm_unpacklo_epi32(a, zero); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi8(zero, a); ++ return _mm_unpacklo_epi8(a, sign); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi16(zero, a); ++ return _mm_unpacklo_epi16(a, sign); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) ++ { ++ __m128i zero = _mm_setzero_si128(); ++ __m128i sign = _mm_cmpgt_epi32(zero, a); ++ return _mm_unpacklo_epi32(a, sign); ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t tmp[4]; ++ _mm_store_si128((__m128i*)tmp, vec); ++ return tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int8_t tmp[16]; ++ _mm_store_si128((__m128i*)tmp, vec); ++ return (int)tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t tmp[4]; ++ _mm_store_si128((__m128i*)tmp, _M128i(vec)); ++ return tmp[LANE]; ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = p; ++ mask[LANE] = 0x0; ++ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = (int8_t)p; ++ mask[LANE] = 0x0; ++ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) ++ { ++ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; ++ __m128 tmp, vec_masked, p_masked; ++ mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it ++ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p ++ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec ++ tmp = _mm_or_ps(vec_masked, p_masked); ++ return tmp; ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi8 (a, b); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi32(a, b); ++ resa = _mm_and_si128 (cmp, a); ++ resb = 
_mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) ++ { ++ __m128i c8000, b_s, a_s, cmp; ++ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff ++ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 ++ b_s = _mm_sub_epi16 (b, c8000); ++ a_s = _mm_sub_epi16 (a, c8000); ++ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) ++ { ++ __m128i c80000000, b_s, a_s, cmp; ++ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 ++ b_s = _mm_sub_epi32 (b, c80000000); ++ a_s = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi8 (b, a); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) ++ { ++ __m128i cmp, resa, resb; ++ cmp = _mm_cmpgt_epi32(b, a); ++ resa = _mm_and_si128 (cmp, a); ++ resb = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(resa, resb); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) ++ { ++ __m128i c8000, b_s, a_s, cmp; ++ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff ++ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 ++ b_s = _mm_sub_epi16 (b, c8000); ++ a_s = _mm_sub_epi16 (a, c8000); ++ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) ++ { ++ __m128i c80000000, b_s, a_s, cmp; ++ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 ++ b_s = _mm_sub_epi32 (b, c80000000); ++ a_s = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed ++ a_s = _mm_and_si128 (cmp,a); ++ b_s = _mm_andnot_si128 (cmp,b); ++ return _mm_or_si128(a_s, b_s); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below ++ { ++ //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
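   // In other words, with every mask byte being 0x00 or 0xFF, (mask & b) | (~mask & a) returns b
   // where the mask is set and a elsewhere; the real _mm_blendv_epi8 tests only the most
   // significant bit of each mask byte, which coincides with this behaviour for such masks.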
++ __m128i a_masked, b_masked; ++ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff ++ a_masked = _mm_andnot_si128 (mask,a); ++ return _mm_or_si128(a_masked, b_masked); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) ++ { ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ __m128i a16, b16, res, reshi,cmp, zero; ++ zero = _mm_setzero_si128(); ++ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); ++ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); ++ res = _mm_unpacklo_epi64(a16, b16); //result without saturation ++ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation ++ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero ++ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive ++ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) ++ { ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ __m128i a16, res, reshi,cmp, zero; ++ zero = _mm_setzero_si128(); ++ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); ++ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation ++ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero ++ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive ++ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff ++ } ++ ++ ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; ++ int64_t res64; ++ int i; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ for (i = 0; i<4; i++) { ++ res64 = atmp[i] * btmp[i]; ++ res[i] = (int)(res64 & 0xffffffff); ++ } ++ return _mm_load_si128((__m128i*)res); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) ++ { ++ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; ++ sign = _mm_xor_si128 (a, b); ++ sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive ++ zero = _mm_setzero_si128(); ++ a_neg = _mm_abs_epi32 (a); //negate a and b ++ b_neg = _mm_abs_epi32 (b); //negate a and b ++ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result ++ mul_us_neg = _mm_sub_epi64(zero, mul_us); ++ mul_us_neg = _mm_and_si128(sign, mul_us_neg); ++ mul_us = _mm_andnot_si128(sign, mul_us); ++ return _mm_or_si128 (mul_us, mul_us_neg); ++ } ++ ++ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b) ++ { ++ __m128i res; ++ res = _mm_cmpeq_epi32 (a, b); ++ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data ++ } ++#endif //SSE4 ++ ++//the special case of functions working only for 32 bits, no SSE4 ++_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; ++ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff}; ++ __m128i vec_masked, p_masked; ++ pvec[LANE] = p; ++ mask[LANE] = 0x0; ++ 
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p ++ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec ++ return _mm_or_si128(vec_masked, p_masked); ++} ++ ++_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE) ++{ ++ _NEON2SSE_ALIGN_16 int64_t tmp[2]; ++ _mm_store_si128((__m128i*)tmp, val); ++ return tmp[LANE]; ++} ++ ++#ifndef _NEON2SSE_64BIT_SSE4 ++ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32 ++ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32 ++#endif ++ ++int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints ++_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) ++{ ++ //Overflow happens only if a and sum have the opposite signs ++ __m128i c7fffffff, res, res_sat, res_xor_a; ++ c7fffffff = _mm_set1_epi32(0x7fffffff); ++ res = _mm_slli_epi32 (a, 1); // res = a*2 ++ res_sat = _mm_srli_epi32(a, 31); ++ res_sat = _mm_add_epi32(res_sat, c7fffffff); ++ res_xor_a = _mm_xor_si128(res, a); ++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise ++ res_sat = _mm_and_si128(res_xor_a, res_sat); ++ res = _mm_andnot_si128(res_xor_a, res); ++ return _mm_or_si128(res, res_sat); ++} ++ ++ ++//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ++//************************************************************************* ++//************************************************************************* ++//***************** Functions redefinition\implementatin starts here ***** ++//************************************************************************* ++//************************************************************************* ++//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ++ ++/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample: ++#ifdef ARM ++#define vector_addq_s32 _mm_add_epi32 ++#else //if we have IA ++#define vector_addq_s32 vadd_s32 ++#endif ++ ++******************************************************************************************** ++Functions below are organised in the following way: ++ ++Each NEON intrinsic function has one of the following options: ++1. its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement ++2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement ++3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition ++4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance ++the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path ++- please consider such functions removal from your code. 
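For example, in the definitions that follow, vaddq_s8 is option 1 (a plain #define to _mm_add_epi8),
vadd_s8 is option 2 (an inline function that widens the 64-bit operands before calling _mm_add_epi8),
and vmul_u32 is option 4 (a per-lane serial implementation flagged with _NEON2SSE_PERFORMANCE_WARNING).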
++*/ ++ ++//*********************************************************************** ++//************************ Vector add ***************************** ++//*********************************************************************** ++int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0]; ++ return res64; ++} ++ ++ ++float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b) ++{ ++ __m128 res; ++ __m64_128 res64; ++ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 ++#define vadd_u8 vadd_s8 ++ ++uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 ++#define vadd_u16 vadd_s16 ++ ++uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 ++#define vadd_u32 vadd_s32 ++ ++uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) ++{ ++ uint64x1_t res64; ++ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0]; ++ return res64; ++} ++ ++ ++int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 ++#define vaddq_s8 _mm_add_epi8 ++ ++int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 ++#define vaddq_s16 _mm_add_epi16 ++ ++int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 ++#define vaddq_s32 _mm_add_epi32 ++ ++int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 ++#define vaddq_s64 _mm_add_epi64 ++ ++float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 ++#define vaddq_f32 _mm_add_ps ++ ++uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 ++#define vaddq_u8 _mm_add_epi8 ++ ++uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 ++#define vaddq_u16 _mm_add_epi16 ++ ++uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 ++#define vaddq_u32 _mm_add_epi32 ++ ++uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 ++#define vaddq_u64 _mm_add_epi64 ++ ++//**************************** Vector long add *****************************: ++//*********************************************************************** ++//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
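As a quick usage sketch (illustrative only: widen_sum is a made-up helper name, and it assumes
application code includes this header in place of arm_neon.h when targeting x86), the widening
form avoids the wrap-around that a plain 8-bit add would produce:

    // Lanes holding 100 + 100 fit comfortably in the int16x8_t result of vaddl_s8,
    // whereas vadd_s8 would wrap 200 to -56 inside an int8 lane.
    int16x8_t widen_sum(int8x8_t a, int8x8_t b)
    {
        return vaddl_s8(a, b); // VADDL.S8 on ARM; _MM_CVTEPI8_EPI16 + _mm_add_epi16 here
    }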
++int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_add_epi16 (a16, b16); ++} ++ ++int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a32, b32); ++} ++ ++int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 ( a64, b64); ++} ++ ++uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi16 (a16, b16); ++} ++ ++uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a32, b32); ++} ++ ++uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a64, b64); ++} ++ ++//*************** Vector wide add: vaddw_. 
Vr[i]:=Va[i]+Vb[i] ****************** ++//*************** ********************************************************************* ++int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 ++_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_add_epi16 (a, b16); ++} ++ ++int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 ++_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, ++ return _mm_add_epi32 (a, b32); ++} ++ ++int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 ++_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a, b64); ++} ++ ++uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 ++_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi16 (a, b16); ++} ++ ++uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0 ++_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi32 (a, b32); ++} ++ ++uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 ++_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_add_epi64 (a, b64); ++} ++ ++//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* ++//************************************************************************************************************************* ++int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vhaddq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64( vhaddq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64( vhaddq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64( vhaddq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64( vhaddq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64( vhaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 
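   // The trick: x + y == 2*(x & y) + (x ^ y), hence (x + y) >> 1 == (x & y) + ((x ^ y) >> 1),
   // so the halved sum can be formed without materialising the (possibly overflowing) full sum.
   // Example: x = 7 (0111b), y = 5 (0101b): (7 & 5) + ((7 ^ 5) >> 1) = 5 + 1 = 6 = (7 + 5) >> 1.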
++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = vshrq_n_s8(tmp2,1); ++ return _mm_add_epi8(tmp1,tmp2); ++} ++ ++int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srai_epi16(tmp2,1); ++ return _mm_add_epi16(tmp1,tmp2); ++} ++ ++int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srai_epi32(tmp2,1); ++ return _mm_add_epi32(tmp1,tmp2); ++} ++ ++uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 ++{ ++ __m128i c1, sum, res; ++ c1 = _mm_set1_epi8(1); ++ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it ++ res = _mm_xor_si128(a, b); //for rounding compensation ++ res = _mm_and_si128(res,c1); //for rounding compensation ++ return _mm_sub_epi8 (sum, res); //actual rounding compensation ++} ++ ++uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 ++{ ++ __m128i sum, res; ++ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it ++ res = _mm_xor_si128(a, b); //for rounding compensation ++ res = _mm_slli_epi16 (res,15); //shift left then back right to ++ res = _mm_srli_epi16 (res,15); //get 1 or zero ++ return _mm_sub_epi16 (sum, res); //actual rounding compensation ++} ++ ++uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 ++{ ++ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_and_si128(a,b); ++ tmp2 = _mm_xor_si128(a,b); ++ tmp2 = _mm_srli_epi32(tmp2,1); ++ return _mm_add_epi32(tmp1,tmp2); ++} ++ ++//************************Vector rounding halving add: vrhadd{q}_. Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** ++//***************************************************************************************************************************** ++int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 
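   // No correction is needed here: _mm_avg_epu8 already computes (a + b + 1) >> 1 per lane,
   // which is exactly the NEON rounding halving add (the truncating vhadd variants above have
   // to subtract the rounding bit instead).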
++} ++ ++ ++uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! ++} ++ ++ ++uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 ++{ ++ //no signed average in x86 SIMD, go to unsigned ++ __m128i c128, au, bu, sum; ++ c128 = _mm_set1_epi8(0x80); //-128 ++ au = _mm_sub_epi8(a, c128); //add 128 ++ bu = _mm_sub_epi8(b, c128); //add 128 ++ sum = _mm_avg_epu8(au, bu); ++ return _mm_add_epi8 (sum, c128); //sub 128 ++} ++ ++int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 ++{ ++ //no signed average in x86 SIMD, go to unsigned ++ __m128i cx8000, au, bu, sum; ++ cx8000 = _mm_set1_epi16(0x8000); // - 32768 ++ au = _mm_sub_epi16(a, cx8000); //add 32768 ++ bu = _mm_sub_epi16(b, cx8000); //add 32768 ++ sum = _mm_avg_epu16(au, bu); ++ return _mm_add_epi16 (sum, cx8000); //sub 32768 ++} ++ ++int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) ++{ ++ //need to avoid overflow ++ __m128i a2, b2, res, sum; ++ a2 = _mm_srai_epi32(a,1); //a2=a/2; ++ b2 = _mm_srai_epi32(b,1); // b2=b/2; ++ res = _mm_or_si128(a,b); //for rounding ++ res = _mm_slli_epi32 (res,31); //shift left then back right to ++ res = _mm_srli_epi32 (res,31); //get 1 or zero ++ sum = _mm_add_epi32(a2,b2); ++ return _mm_add_epi32(sum,res); ++} ++ ++uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 ++#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded ++ ++uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 ++#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded ++ ++ ++uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 ++{ ++ //need to avoid overflow ++ __m128i a2, b2, res, sum; ++ a2 = _mm_srli_epi32(a,1); //a2=a/2; ++ b2 = _mm_srli_epi32(b,1); // b2=b/2; ++ res = _mm_or_si128(a,b); //for rounding ++ res = _mm_slli_epi32 (res,31); //shift left then back right to ++ res = _mm_srli_epi32 (res,31); //get 1 or zero ++ sum = _mm_add_epi32(a2,b2); ++ return _mm_add_epi32(sum,res); ++} ++ ++//****************** VQADD: Vector saturating add ************************ ++//************************************************************************ ++int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vqaddq_s32(_pM128i(a), _pM128i(b))); 
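   // As with the other d-register forms, the operands are widened to __m128i, the saturating
   // q-form (vqaddq_s32, defined below) does the work, and only the low 64 bits are returned.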
++} ++ ++ ++int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int64x1_t res; ++ uint64_t a64, b64; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 + b64; ++ a64 = (a64 >> 63) + (~_SIGNBIT64); ++ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) { ++ res.m64_u64[0] = a64; ++ } ++ return res; ++} ++ ++uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vqaddq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t a64, b64; ++ uint64x1_t res; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 + b64; ++ if (res.m64_u64[0] < a64) { ++ res.m64_u64[0] = ~(uint64_t)0; ++ } ++ return res; ++} ++ ++int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 ++#define vqaddq_s8 _mm_adds_epi8 ++ ++int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 ++#define vqaddq_s16 _mm_adds_epi16 ++ ++int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b) ++{ ++ //no corresponding x86 SIMD soulution, special tricks are necessary. 
Overflow happens only if a and b have the same sign and sum has the opposite sign ++ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_; ++ c7fffffff = _mm_set1_epi32(0x7fffffff); ++ res = _mm_add_epi32(a, b); ++ res_sat = _mm_srli_epi32(a, 31); ++ res_sat = _mm_add_epi32(res_sat, c7fffffff); ++ res_xor_a = _mm_xor_si128(res, a); ++ b_xor_a_ = _mm_xor_si128(b, a); ++ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a); ++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise ++ res_sat = _mm_and_si128(res_xor_a, res_sat); ++ res = _mm_andnot_si128(res_xor_a, res); ++ return _mm_or_si128(res, res_sat); ++} ++ ++int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = atmp[0] + btmp[0]; ++ res[1] = atmp[1] + btmp[1]; ++ ++ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64); ++ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64); ++ ++ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) { ++ res[0] = atmp[0]; ++ } ++ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) { ++ res[1] = atmp[1]; ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 ++#define vqaddq_u8 _mm_adds_epu8 ++ ++uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0 ++#define vqaddq_u16 _mm_adds_epu16 ++ ++uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b) ++{ ++ __m128i c80000000, cmp, subsum, suba, sum; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ sum = _mm_add_epi32 (a, b); ++ subsum = _mm_sub_epi32 (sum, c80000000); ++ suba = _mm_sub_epi32 (a, c80000000); ++ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed ++ return _mm_or_si128 (sum, cmp); //saturation ++} ++ ++uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b) ++ { ++ __m128i c80000000, sum, cmp, suba, subsum; ++ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); ++ sum = _mm_add_epi64 (a, b); ++ subsum = _mm_sub_epi64 (sum, c80000000); ++ suba = _mm_sub_epi64 (a, c80000000); ++ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!! 
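   // Subtracting 2^63 from both the sum and a maps unsigned order onto signed order, so the
   // signed compare above implements the unsigned test "a > a + b"; its result is all-ones in
   // exactly the lanes where the addition wrapped, and OR-ing it into the sum saturates those
   // lanes to all-ones (the unsigned maximum).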
++ return _mm_or_si128 (sum, cmp); //saturation ++ } ++#else ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = atmp[0] + btmp[0]; ++ res[1] = atmp[1] + btmp[1]; ++ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; ++ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; ++ return _mm_load_si128((__m128i*)(res)); ++ } ++#endif ++ ++ ++//******************* Vector add high half (truncated) ****************** ++//************************************************************************ ++int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum; ++ sum = _mm_add_epi16 (a, b); ++ sum = _mm_srai_epi16 (sum, 8); ++ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only ++ return64(sum); ++} ++ ++int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0 ++{ ++ int16x4_t res64; ++ __m128i sum; ++ sum = _mm_add_epi32 (a, b); ++ sum = _mm_srai_epi32(sum, 16); ++ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only ++ return64(sum); ++} ++ ++int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b) ++{ ++ int32x2_t res64; ++ __m128i sum; ++ sum = _mm_add_epi64 (a, b); ++ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sum); ++} ++ ++uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum; ++ sum = _mm_add_epi16 (a, b); ++ sum = _mm_srli_epi16 (sum, 8); ++ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only ++ return64(sum); ++} ++ ++uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0 ++{ ++ uint16x4_t res64; ++ __m128i sum; ++ sum = _mm_add_epi32 (a, b); ++ sum = _mm_srli_epi32 (sum, 16); ++ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only ++ return64(sum); ++} ++ ++uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 ++#define vaddhn_u64 vaddhn_s64 ++ ++//*********** Vector rounding add high half: vraddhn_ ******************. 
++//*************************************************************************** ++int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sum = _mm_srai_epi16 (sum, 8); //get high half ++ sum = _mm_add_epi16 (sum, mask1); //actual rounding ++ sum = _mm_packs_epi16 (sum, sum); ++ return64(sum); ++} ++ ++int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ int16x4_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sum = _mm_srai_epi32 (sum, 16); //get high half ++ sum = _mm_add_epi32 (sum, mask1); //actual rounding ++ sum = _mm_packs_epi32 (sum, sum); ++ return64(sum); ++} ++ ++int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ int32x2_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi64 (a, b); ++ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero ++ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding ++ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6)); ++ return64(sum); ++} ++ ++uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sum = _mm_srai_epi16 (sum, 8); //get high half ++ sum = _mm_add_epi16 (sum, mask1); //actual rounding ++ sum = _mm_packus_epi16 (sum, sum); ++ return64(sum); ++} ++ ++uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ uint16x4_t res64; ++ __m128i sum, mask1; ++ sum = _mm_add_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sum = _mm_srai_epi32 (sum, 16); //get high half ++ sum = _mm_add_epi32 (sum, mask1); //actual rounding ++ sum = _MM_PACKUS1_EPI32 (sum); ++ return64(sum); ++} ++ ++uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 ++#define vraddhn_u64 vraddhn_s64 ++ ++//********************************************************************************** ++//********* Multiplication ************************************* ++//************************************************************************************** ++ ++//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] ++//As we don't go to wider result functions are equal to "multiply low" in x86 ++int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0 ++{ ++ // no 8 bit 
simd multiply, need to go to 16 bits in SSE ++ int8x8_t res64; ++ __m128i a128, b128, res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits ++ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits ++ res = _mm_mullo_epi16 (a128, b128); ++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only ++ return64(res); ++} ++ ++int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 ++#define vmul_s16 vmul_u16 ++ ++int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 ++#define vmul_s32 vmul_u32 ++ ++float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t tmp; ++ __m64_128 res64; ++ tmp = _mm_mul_ps(_pM128(a),_pM128(b)); ++ _M64f(res64, tmp); //use low 64 bits ++ return res64; ++} ++ ++uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits in SSE ++ uint8x8_t res64; ++ __m128i mask, a128, b128, res; ++ mask = _mm_set1_epi16(0xff); ++ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); ++ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); ++ res = _mm_mullo_epi16 (a128, b128); ++ res = _mm_and_si128(res, mask); //to avoid saturation ++ res = _mm_packus_epi16 (res,res); //use only low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0]; ++ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1]; ++ return res; ++} ++ ++poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 ++_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b) ++{ ++ //may be optimized ++ poly8x8_t res64; ++ __m128i a64, b64, c1, res, tmp, bmasked; ++ int i; ++ a64 = _pM128i(a); ++ b64 = _pM128i(b); ++ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff.... 
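   // Polynomial (carry-less, GF(2)) multiply: the loop below peels off one bit of b at a time,
   // multiplies a by that single masked bit, and accumulates the partial products with XOR
   // rather than ADD, e.g. (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1 with no carries between bits.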
++ c1 = vshrq_n_u8(c1,7); //0x1 ++ bmasked = _mm_and_si128(b64, c1); //0x1 ++ res = vmulq_u8(a64, bmasked); ++ for(i = 1; i<8; i++) { ++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here ++ bmasked = _mm_and_si128(b64, c1); //0x1 ++ tmp = vmulq_u8(a64, bmasked); ++ res = _mm_xor_si128(res, tmp); ++ } ++ return64 (res); ++} ++ ++int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits ++ //solution may be not optimal ++ __m128i a16, b16, r16_1, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (a16, b16); ++ //swap hi and low part of a and b to process the remaining data ++ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 ++ ++ r16_2 = _mm_mullo_epi16 (a16, b16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit ++ ++ return _mm_unpacklo_epi64(r16_1, r16_2); ++} ++ ++int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 ++#define vmulq_s16 _mm_mullo_epi16 ++ ++int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 ++#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 ++ ++float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 ++#define vmulq_f32 _mm_mul_ps ++ ++uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits ++ //solution may be not optimal ++ __m128i maskff, a16, b16, r16_1, r16_2; ++ maskff = _mm_set1_epi16(0xff); ++ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (a16, b16); ++ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation ++ //swap hi and low part of a and b to process the remaining data ++ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (a16, b16); ++ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation ++ return _mm_packus_epi16 (r16_1, r16_2); ++} ++ ++uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 ++#define vmulq_u16 _mm_mullo_epi16 ++ ++uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 ++#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 ++ ++poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 ++_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) ++{ ++ //may be optimized ++ __m128i c1, res, tmp, bmasked; ++ int i; ++ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 
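++    //same bitwise carry-less multiply as vmul_p8 above, applied to all 16 bytes;
++    //only 8 iterations are needed for 8-bit polynomials, so the 16 bit shift of
++    //the byte mask never crosses into a neighboring byte, and each product is
++    //truncated to its low 8 bits as VMUL.P8 requires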
++ c1 = vshrq_n_u8(c1,7); //0x1 ++ bmasked = _mm_and_si128(b, c1); //0x1 ++ res = vmulq_u8(a, bmasked); ++ for(i = 1; i<8; i++) { ++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here ++ bmasked = _mm_and_si128(b, c1); //0x1 ++ tmp = vmulq_u8(a, bmasked); ++ res = _mm_xor_si128(res, tmp); ++ } ++ return res; ++} ++ ++//************************* Vector long multiply *********************************** ++//**************************************************************************** ++int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0 ++{ ++ //no 8 bit simd multiply, need to go to 16 bits ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 ++ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit ++} ++ ++int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0 ++{ ++ #ifdef USE_SSE4 ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 ++ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 ++ #else ++ __m128i low, hi, a128,b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ low = _mm_mullo_epi16(a128,b128); ++ hi = _mm_mulhi_epi16(a128,b128); ++ return _mm_unpacklo_epi16(low,hi); ++ #endif ++} ++ ++int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0 ++{ ++ __m128i ab, ba, a128, b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 ++ return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++} ++ ++uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0 ++{ ++ //no 8 bit simd multiply, need to go to 16 bits ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 ++ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit ++} ++ ++uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0 ++{ ++ #ifdef USE_SSE4 ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 ++ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 ++ #else ++ __m128i a128,b128,low, hi; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ low = _mm_mullo_epi16(a128,b128); ++ hi = _mm_mulhi_epu16(a128,b128); ++ return _mm_unpacklo_epi16(low,hi); ++ #endif ++} ++ ++uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0 ++{ ++ ///may be not optimal compared with serial implementation ++ __m128i ab, ba, a128, b128; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 ++ return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++} ++ ++poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 ++_NEON2SSE_INLINE poly16x8_t 
vmull_p8(poly8x8_t a, poly8x8_t b) ++{ ++ //may be optimized ++ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked; ++ int i; ++ a128 = _pM128i(a); ++ b128 = _pM128i(b); ++ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff.... ++ c1 = vshrq_n_u8(c1,7); //0x1 ++ bmasked = _mm_and_si128(b128, c1); //0x1 ++ ++ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1 ++ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 ++ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit ++ for(i = 1; i<8; i++) { ++ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here ++ bmasked = _mm_and_si128(b128, c1); //0x1 ++ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 ++ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked); ++ res = _mm_xor_si128(res, tmp); ++ } ++ return res; ++} ++ ++//****************Vector saturating doubling long multiply ************************** ++//***************************************************************** ++int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b) ++{ ++ //the serial soulution may be faster due to saturation ++ __m128i res; ++ res = vmull_s16(a, b); ++ return vqd_s32(res); ++} ++ ++int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //the serial soulution may be faster due to saturation ++ __m128i res; ++ res = vmull_s32(a,b); ++ return vqaddq_s64(res,res); //slow serial function!!!! ++} ++ ++//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************ ++//****************************************************************************************** ++int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0 ++{ ++ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits ++ int8x8_t res64; ++ __m128i b128, c128, res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits ++ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits ++ res = _mm_mullo_epi16 (c128, b128); ++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); ++ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits ++ return64(res); ++} ++ ++int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) ++{ ++ int16x4_t res64; ++ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); ++} ++ ++ ++int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0 ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1 ++ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits ++ return64(res); ++} ++ ++float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) ++{ ++ //fma is coming soon, but right now: ++ __m128 res; ++ __m64_128 res64; ++ res = _mm_mul_ps (_pM128(c), _pM128(b)); ++ res = _mm_add_ps 
(_pM128(a), res); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0 ++{ ++ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits ++ uint8x8_t res64; ++ __m128i mask, b128, c128, res; ++ mask = _mm_set1_epi16(0xff); ++ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits ++ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits ++ res = _mm_mullo_epi16 (c128, b128); ++ res = _mm_and_si128(res, mask); //to avoid saturation ++ res = _mm_packus_epi16 (res, res); ++ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 ++#define vmla_u16 vmla_s16 ++ ++uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 ++#define vmla_u32 vmla_s32 ++ ++int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2,r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_add_epi8 (r16_1, a); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_add_epi8(r16_2, a_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_mullo_epi16 (c, b); ++ return _mm_add_epi16 (res, a); ++} ++ ++int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 ++ return _mm_add_epi32 (res, a); ++} ++ ++float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 ++{ ++ //fma is coming soon, but right now: ++ __m128 res; ++ res = _mm_mul_ps (c, b); ++ return _mm_add_ps (a, res); ++} ++ ++uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 
(c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_add_epi8 (r16_1, a); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_add_epi8(r16_2, a_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 ++#define vmlaq_u16 vmlaq_s16 ++ ++uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 ++#define vmlaq_u32 vmlaq_s32 ++ ++//********************** Vector widening multiply accumulate (long multiply accumulate): ++// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** ++//******************************************************************************************** ++int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0 ++{ ++ int16x8_t res; ++ res = vmull_s8(b, c); ++ return _mm_add_epi16 (res, a); ++} ++ ++int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int32x4_t res; ++ res = vmull_s16(b, c); ++ return _mm_add_epi32 (res, a); ++} ++ ++int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_s32( b, c); ++ return _mm_add_epi64 (res, a); ++} ++ ++uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0 ++{ ++ uint16x8_t res; ++ res = vmull_u8(b, c); ++ return _mm_add_epi16 (res, a); ++} ++ ++uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ uint32x4_t res; ++ res = vmull_u16(b, c); ++ return _mm_add_epi32 (res, a); ++} ++ ++uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_u32( b,c); ++ return _mm_add_epi64 (res, a); ++} ++ ++//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** ++//******************************************************************************************** ++int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0 ++{ ++ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits ++ int8x8_t res64; ++ 
__m128i res; ++ res64 = vmul_s8(b,c); ++ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); ++ return64(res); ++} ++ ++int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) ++{ ++ int16x4_t res64; ++ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); ++} ++ ++ ++int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0 ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 ++ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only ++ return64(res); ++} ++ ++float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) ++{ ++ __m128 res; ++ __m64_128 res64; ++ res = _mm_mul_ps (_pM128(c), _pM128(b)); ++ res = _mm_sub_ps (_pM128(a), res); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits ++ uint8x8_t res64; ++ __m128i res; ++ res64 = vmul_u8(b,c); ++ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); ++ return64(res); ++} ++ ++uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 ++#define vmls_u16 vmls_s16 ++ ++uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 ++#define vmls_u32 vmls_s32 ++ ++ ++int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); ++ r16_1 = _mm_sub_epi8 (a, r16_1); ++ //swap hi and low part of a, b, c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_sub_epi8 (a_2, r16_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_mullo_epi16 (c, b); ++ return _mm_sub_epi16 (a, res); ++} ++ ++int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 ++ return _mm_sub_epi32 (a, res); ++} ++ ++float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, 
float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 ++{ ++ __m128 res; ++ res = _mm_mul_ps (c, b); ++ return _mm_sub_ps (a, res); ++} ++ ++uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 ++{ ++ //solution may be not optimal ++ // no 8 bit simd multiply, need to go to 16 bits ++ __m128i b16, c16, r16_1, a_2, r16_2; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 ++ r16_1 = _mm_mullo_epi16 (b16, c16); ++ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits ++ r16_1 = _mm_sub_epi8 (a, r16_1); ++ //swap hi and low part of a, b and c to process the remaining data ++ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); ++ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); ++ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 ++ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 ++ ++ r16_2 = _mm_mullo_epi16 (b16, c16); ++ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); ++ r16_2 = _mm_sub_epi8(a_2, r16_2); ++ return _mm_unpacklo_epi64(r16_1,r16_2); ++} ++ ++uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 ++#define vmlsq_u16 vmlsq_s16 ++ ++uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 ++#define vmlsq_u32 vmlsq_s32 ++ ++//******************** Vector multiply subtract long (widening multiply subtract) ************************************ ++//************************************************************************************************************* ++int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0 ++{ ++ int16x8_t res; ++ res = vmull_s8(b, c); ++ return _mm_sub_epi16 (a, res); ++} ++ ++int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int32x4_t res; ++ res = vmull_s16(b, c); ++ return _mm_sub_epi32 (a, res); ++} ++ ++int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ int64x2_t res; ++ res = vmull_s32( b,c); ++ return _mm_sub_epi64 (a, res); ++} ++ ++uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0 ++{ ++ uint16x8_t res; ++ res = vmull_u8(b, c); ++ return _mm_sub_epi16 (a, res); ++} ++ ++uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0 ++{ ++ //may be not optimal compared with serial implementation ++ uint32x4_t res; ++ res = vmull_u16(b, c); ++ return _mm_sub_epi32 (a, res); ++} ++ ++uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0 ++{ ++ //may be not optimal 
compared with serial implementation ++ int64x2_t res; ++ res = vmull_u32( b,c); ++ return _mm_sub_epi64 (a, res); ++} ++ ++//****** Vector saturating doubling multiply high ********************** ++//************************************************************************* ++int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int16x4_t res; ++ int32_t a32, b32, i; ++ for (i = 0; i<4; i++) { ++ a32 = (int32_t) a.m64_i16[i]; ++ b32 = (int32_t) b.m64_i16[i]; ++ a32 = (a32 * b32) >> 15; ++ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32; ++ } ++ return res; ++} ++ ++int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster ++{ ++ //may be not optimal compared with a serial solution ++ int32x2_t res64; ++ __m128i mask; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ int64x2_t mul; ++ mul = vmull_s32(a,b); ++ mul = _mm_slli_epi64(mul,1); //double the result ++ //at this point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++ return64(mul); ++} ++ ++int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 ++{ ++ __m128i res, res_lo, mask; ++ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; ++ res = _mm_mulhi_epi16 (a, b); ++ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation ++ res_lo = _mm_mullo_epi16 (a, b); ++ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit ++ res = _mm_add_epi16(res, res_lo); //combine results ++ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 ++} ++ ++int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target ++ __m128i ab, ba, mask, mul, mul1; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 ++ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul = _mm_slli_epi64(mul,1); //double the result ++ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 ++ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 ++ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul1 = _mm_slli_epi64(mul1,1); //double the result ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits ++ mul = _mm_unpacklo_epi64(mul, mul1); ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 
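++    //overflow is possible only for a==b==0x80000000: the doubled product is then
++    //0x8000000000000000, its high word equals 0x80000000, the compare mask is all
++    //ones for exactly those lanes, and the xor below flips them to the saturated
++    //value 0x7fffffff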
++ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++} ++ ++//********* Vector saturating rounding doubling multiply high **************** ++//**************************************************************************** ++//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order ++int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //may be not optimal compared with a serial solution ++ int32x2_t res64; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ __m128i res_sat, mask, mask1; ++ int64x2_t mul; ++ mul = vmull_s32(a,b); ++ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding ++ //at this point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++ return64(mul); ++} ++ ++int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 ++{ ++ __m128i mask, res; ++ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; ++ res = _mm_mulhrs_epi16 (a, b); ++ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 ++} ++ ++int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target ++ __m128i ab, ba, mask, mul, mul1, mask1; ++ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 ++ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 ++ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul = _mm_add_epi32 (mul, mask1); //actual rounding ++ ++ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 ++ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 ++ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result ++ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered ++ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero ++ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding ++ //at this 
point start treating 2 64-bit numbers as 4 32-bit ++ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit ++ mul = _mm_unpacklo_epi64(mul, mul1); ++ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); ++ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 ++} ++ ++//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** ++//************************************************************************************************************************* ++int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0 ++{ ++ //not optimal SIMD soulution, serial may be faster ++ __m128i res32; ++ res32 = vmull_s16(b, c); ++ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1); ++ return vqaddq_s32(res32, a); //saturation ++} ++ ++int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res64; ++ res64 = vmull_s32(b,c); ++ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1); ++ return vqaddq_s64(res64, a); //saturation ++} ++ ++//************************************************************************************ ++//****************** Vector subtract *********************************************** ++//************************************************************************************ ++int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0]; ++ return res64; ++} ++ ++ ++float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0]; ++ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1]; ++ return res; ++} ++ ++uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 ++#define vsub_u8 vsub_s8 ++ ++uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 ++#define vsub_u16 vsub_s16 ++ ++uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 ++#define vsub_u32 vsub_s32 ++ ++ ++uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b) ++{ ++ int64x1_t res64; ++ res64.m64_u64[0] = a.m64_u64[0] 
- b.m64_u64[0]; ++ return res64; ++} ++ ++ ++int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 ++#define vsubq_s8 _mm_sub_epi8 ++ ++int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 ++#define vsubq_s16 _mm_sub_epi16 ++ ++int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 ++#define vsubq_s32 _mm_sub_epi32 ++ ++int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 ++#define vsubq_s64 _mm_sub_epi64 ++ ++float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 ++#define vsubq_f32 _mm_sub_ps ++ ++uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 ++#define vsubq_u8 _mm_sub_epi8 ++ ++uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 ++#define vsubq_u16 _mm_sub_epi16 ++ ++uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 ++#define vsubq_u32 _mm_sub_epi32 ++ ++uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 ++#define vsubq_u64 _mm_sub_epi64 ++ ++//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** ++//*********************************************************************************** ++//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. ++int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a16, b16); ++} ++ ++int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a32, b32); ++} ++ ++int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 ++_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi64 (a64, b64); ++} ++ ++uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a16, b16); ++} ++ ++uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a32, b32); ++} ++ ++uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 ++_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0 ++{ ++ //may be not optimal ++ __m128i a64, b64; ++ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi64 (a64, b64); ++} ++ ++//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** ++//***************************************************************************************************** ++int16x8_t vsubw_s8(int16x8_t a, 
int8x8_t b); // VSUBW.S8 q0,q0,d0 ++_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a, b16); ++} ++ ++int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 ++_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a, b32); ++} ++ ++int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 ++_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_sub_epi64 (a, b64); ++} ++ ++uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 ++_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0 ++{ ++ __m128i b16; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi16 (a, b16); ++} ++ ++uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0 ++_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0 ++{ ++ __m128i b32; ++ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, ++ return _mm_sub_epi32 (a, b32); ++} ++ ++uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 ++_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0 ++{ ++ __m128i b64; ++ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 ++ return _mm_sub_epi64 (a, b64); ++} ++ ++//************************Vector saturating subtract ********************************* ++//************************************************************************************* ++int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vqsubq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution ++{ ++ uint64x1_t res; ++ uint64_t a64,b64; ++ a64 = a.m64_u64[0]; ++ b64 = b.m64_u64[0]; ++ res.m64_u64[0] = a64 - b64; ++ ++ a64 = (a64 >> 63) + (~_SIGNBIT64); ++ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) { ++ res.m64_u64[0] = a64; ++ } ++ return res; ++} ++ ++uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ 
return64(vqsubq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint64x1_t res; ++ uint64_t a64, b64; ++ a64 = _Ui64(a); ++ b64 = _Ui64(b); ++ if (a64 > b64) { ++ res.m64_u64[0] = a64 - b64; ++ } else { ++ res.m64_u64[0] = 0; ++ } ++ return res; ++} ++ ++int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 ++#define vqsubq_s8 _mm_subs_epi8 ++ ++int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 ++#define vqsubq_s16 _mm_subs_epi16 ++ ++int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b) ++{ ++ //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a ++ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a; ++ c7fffffff = _mm_set1_epi32(0x7fffffff); ++ res = _mm_sub_epi32(a, b); ++ res_sat = _mm_srli_epi32(a, 31); ++ res_sat = _mm_add_epi32(res_sat, c7fffffff); ++ res_xor_a = _mm_xor_si128(res, a); ++ b_xor_a = _mm_xor_si128(b, a); ++ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a); ++ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise ++ res_sat = _mm_and_si128(res_xor_a, res_sat); ++ res = _mm_andnot_si128(res_xor_a, res); ++ return _mm_or_si128(res, res_sat); ++} ++ ++int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution ++{ ++ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2]; ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = atmp[0] - btmp[0]; ++ res[1] = atmp[1] - btmp[1]; ++ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) { ++ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64; ++ } ++ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) { ++ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64; ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 ++#define vqsubq_u8 _mm_subs_epu8 ++ ++uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0 ++#define vqsubq_u16 _mm_subs_epu16 ++ ++uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0 ++{ ++ __m128i min, mask, sub; ++ min = _MM_MIN_EPU32(a, b); //SSE4.1 ++ mask = _mm_cmpeq_epi32 (min, b); ++ sub = _mm_sub_epi32 (a, b); ++ return _mm_and_si128 ( sub, mask); ++} ++ ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b) ++ { ++ __m128i c80000000, subb, suba, cmp, sub; ++ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); ++ sub = _mm_sub_epi64 (a, b); ++ suba = _mm_sub_epi64 (a, c80000000); ++ subb = _mm_sub_epi64 (b, c80000000); ++ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!! 
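++        //biasing both operands by 2^63 maps unsigned order onto signed order, so
++        //the signed 64-bit compare is all ones exactly where a > b; anding the raw
++        //difference with this mask forces the lanes with a <= b to zero, which is
++        //the required saturation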
++ return _mm_and_si128 (sub, cmp); //saturation ++ } ++#else ++ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++ { ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ _mm_store_si128((__m128i*)btmp, b); ++ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; ++ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0; ++ return _mm_load_si128((__m128i*)(res)); ++ } ++#endif ++ ++//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** ++//**************************************************************** ++int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0 ++{ ++ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit, ++ int8x8_t res64; ++ __m128i r16; ++ int8x8_t r; ++ r = vsub_s8 (a, b); ++ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 ++ r16 = _mm_srai_epi16 (r16, 1); //SSE2 ++ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits ++ return64(r16); ++} ++ ++int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vhsubq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++ ++int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vhsubq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vhsubq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vhsubq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vhsubq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 ++{ ++ // //need to deal with the possibility of internal overflow ++ __m128i c128, au,bu; ++ c128 = _mm_set1_epi8 (128); ++ au = _mm_add_epi8( a, c128); ++ bu = _mm_add_epi8( b, c128); ++ return vhsubq_u8(au,bu); ++} ++ ++int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i c8000, au,bu; ++ c8000 = _mm_set1_epi16(0x8000); ++ au = _mm_add_epi16( a, c8000); ++ bu = _mm_add_epi16( b, c8000); ++ return vhsubq_u16(au,bu); ++} ++ ++int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i a2, b2,r, b_1; ++ a2 = _mm_srai_epi32 (a,1); ++ b2 = _mm_srai_epi32 (b,1); ++ r = _mm_sub_epi32 (a2, b2); ++ b_1 = _mm_andnot_si128(a, b); //!a and b ++ b_1 = _mm_slli_epi32 (b_1,31); ++ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit ++ return _mm_sub_epi32(r,b_1); ++} ++ 
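++//Note for the unsigned halving subtracts below: _mm_avg_epu8/_mm_avg_epu16 return
++//the rounded-up average (a+b+1)>>1, and a - ((a+b+1)>>1) equals (a-b)>>1 with the
++//arithmetic (floor) shift, taken modulo the lane width. For example, for bytes
++//a=2, b=5: avg = 4 and 2-4 wraps to 0xfe, the same bit pattern as (2-5)>>1 = -2
++//in 8 bits, so one average plus one subtract suffices.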
++uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 ++{ ++ __m128i avg; ++ avg = _mm_avg_epu8 (a, b); ++ return _mm_sub_epi8(a, avg); ++} ++ ++uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 ++{ ++ __m128i avg; ++ avg = _mm_avg_epu16 (a, b); ++ return _mm_sub_epi16(a, avg); ++} ++ ++uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 ++{ ++ //need to deal with the possibility of internal overflow ++ __m128i a2, b2,r, b_1; ++ a2 = _mm_srli_epi32 (a,1); ++ b2 = _mm_srli_epi32 (b,1); ++ r = _mm_sub_epi32 (a2, b2); ++ b_1 = _mm_andnot_si128(a, b); //!a and b ++ b_1 = _mm_slli_epi32 (b_1,31); ++ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit ++ return _mm_sub_epi32(r,b_1); ++} ++ ++//******* Vector subtract high half (truncated) ** ************ ++//************************************************************ ++int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sum, sum8; ++ sum = _mm_sub_epi16 (a, b); ++ sum8 = _mm_srai_epi16 (sum, 8); ++ sum8 = _mm_packs_epi16(sum8,sum8); ++ return64(sum8); ++} ++ ++int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0 ++{ ++ int16x4_t res64; ++ __m128i sum, sum16; ++ sum = _mm_sub_epi32 (a, b); ++ sum16 = _mm_srai_epi32 (sum, 16); ++ sum16 = _mm_packs_epi32(sum16,sum16); ++ return64(sum16); ++} ++ ++int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b) ++{ ++ int32x2_t res64; ++ __m128i sub; ++ sub = _mm_sub_epi64 (a, b); ++ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sub); ++} ++ ++uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sum, sum8; ++ sum = _mm_sub_epi16 (a, b); ++ sum8 = _mm_srli_epi16 (sum, 8); ++ sum8 = _mm_packus_epi16(sum8,sum8); ++ return64(sum8); ++} ++ ++uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0 ++{ ++ uint16x4_t res64; ++ __m128i sum, sum16; ++ sum = _mm_sub_epi32 (a, b); ++ sum16 = _mm_srli_epi32 (sum, 16); ++ sum16 = _MM_PACKUS1_EPI32(sum16); ++ return64(sum16); ++} ++ ++uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 ++#define vsubhn_u64 vsubhn_s64 ++ ++//************ Vector rounding subtract high half ********************* ++//********************************************************************* ++int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0 ++{ ++ int8x8_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sub = _mm_srai_epi16 (sub, 8); //get high half ++ sub = _mm_add_epi16 (sub, mask1); 
//actual rounding ++ sub = _mm_packs_epi16 (sub, sub); ++ return64(sub); ++} ++ ++int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ int16x4_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sub = _mm_srai_epi32 (sub, 16); //get high half ++ sub = _mm_add_epi32 (sub, mask1); //actual rounding ++ sub = _mm_packs_epi32 (sub, sub); ++ return64(sub); ++} ++ ++int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b) ++{ ++ //SIMD may be not optimal, serial may be faster ++ int32x2_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi64 (a, b); ++ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to ++ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero ++ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding ++ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); ++ return64(sub); ++} ++ ++uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 ++_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0 ++{ ++ uint8x8_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi16 (a, b); ++ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to ++ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero ++ sub = _mm_srai_epi16 (sub, 8); //get high half ++ sub = _mm_add_epi16 (sub, mask1); //actual rounding ++ sub = _mm_packus_epi16 (sub, sub); ++ return64(sub); ++} ++ ++uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 ++_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0 ++{ ++ //SIMD may be not optimal, serial may be faster ++ uint16x4_t res64; ++ __m128i sub, mask1; ++ sub = _mm_sub_epi32 (a, b); ++ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to ++ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero ++ sub = _mm_srai_epi32 (sub, 16); //get high half ++ sub = _mm_add_epi32 (sub, mask1); //actual rounding ++ sub = _MM_PACKUS1_EPI32 (sub); ++ return64(sub); ++} ++ ++uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 ++#define vrsubhn_u64 vrsubhn_s64 ++ ++//*********** Vector saturating doubling multiply subtract long ******************** ++//************************************************************************************ ++int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) ++{ ++ //not optimal SIMD soulution, serial may be faster ++ __m128i res32, mask; ++ int32x4_t res; ++ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ res = vmull_s16(b, c); ++ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered ++ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask); ++ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000 ++ return vqsubq_s32(a, res32); //saturation ++} ++ ++int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ 
__m128i res64, mask; ++ int64x2_t res; ++ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000}; ++ res = vmull_s32(b, c); ++ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered ++ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask); ++ res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000 ++ return vqsubq_s64(a, res64); //saturation ++} ++ ++//****************** COMPARISON *************************************** ++//******************* Vector compare equal ************************************* ++//**************************************************************************** ++uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) ); ++ return64f(res); ++} ++ ++uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 ++#define vceq_p8 vceq_u8 ++ ++ ++uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_s8 _mm_cmpeq_epi8 ++ ++uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 ++#define vceqq_s16 _mm_cmpeq_epi16 ++ ++uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 ++#define vceqq_s32 _mm_cmpeq_epi32 ++ ++uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpeq_ps(a,b); ++ return _M128i(res); ++} ++ ++uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_u8 _mm_cmpeq_epi8 ++ ++uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 ++#define vceqq_u16 _mm_cmpeq_epi16 ++ ++uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 ++#define vceqq_u32 _mm_cmpeq_epi32 ++ ++uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 ++#define vceqq_p8 _mm_cmpeq_epi8 ++ ++//******************Vector compare greater-than or equal************************* ++//******************************************************************************* ++//in IA SIMD no greater-than-or-equal 
comparison for integers, ++// there is greater-than available only, so we need the following tricks ++ ++uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vcgeq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vcgeq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vcgeq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries ++ return64f(res); ++} ++ ++uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vcgeq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vcgeq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b) ++{ ++ //serial solution looks faster ++ uint32x2_t res64; ++ return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); ++} ++ ++ ++ ++uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi8 ( a, b); ++ m2 = _mm_cmpeq_epi8 ( a, b); ++ return _mm_or_si128 ( m1, m2); ++} ++ ++uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi16 ( a, b); ++ m2 = _mm_cmpeq_epi16 ( a, b); ++ return _mm_or_si128 ( m1,m2); ++} ++ ++uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 ++{ ++ __m128i m1, m2; ++ m1 = _mm_cmpgt_epi32 (a, b); ++ m2 = _mm_cmpeq_epi32 (a, b); ++ return _mm_or_si128 (m1, m2); ++} ++ ++uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpge_ps(a,b); //use only 2 first entries ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 ++{ ++ //no unsigned chars comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu8(a, b); ++ return _mm_cmpeq_epi8(cmp, a); //a>=b ++ #else ++ __m128i c128, as, bs, m1, m2; ++ c128 = _mm_set1_epi8 (128); ++ as = _mm_sub_epi8( a, c128); ++ bs = _mm_sub_epi8( b, c128); ++ m1 = _mm_cmpgt_epi8( as, bs); ++ m2 = _mm_cmpeq_epi8 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t 
vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 ++{ ++ //no unsigned shorts comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu16(a, b); ++ return _mm_cmpeq_epi16(cmp, a); //a>=b ++ #else ++ __m128i c8000, as, bs, m1, m2; ++ c8000 = _mm_set1_epi16 (0x8000); ++ as = _mm_sub_epi16(a,c8000); ++ bs = _mm_sub_epi16(b,c8000); ++ m1 = _mm_cmpgt_epi16(as, bs); ++ m2 = _mm_cmpeq_epi16 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 ++{ ++ //no unsigned ints comparison, only signed available,so need the trick ++ #ifdef USE_SSE4 ++ __m128i cmp; ++ cmp = _mm_max_epu32(a, b); ++ return _mm_cmpeq_epi32(cmp, a); //a>=b ++ #else ++ //serial solution may be faster ++ __m128i c80000000, as, bs, m1, m2; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ as = _mm_sub_epi32(a,c80000000); ++ bs = _mm_sub_epi32(b,c80000000); ++ m1 = _mm_cmpgt_epi32 (as, bs); ++ m2 = _mm_cmpeq_epi32 (as, bs); ++ return _mm_or_si128 ( m1, m2); ++ #endif ++} ++ ++//**********************Vector compare less-than or equal****************************** ++//*************************************************************************************** ++//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks ++ ++uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vcleq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vcleq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vcleq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0? ++_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmple_ps(_pM128(a),_pM128(b)); ++ return64f(res); ++} ++ ++uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 ++#define vcle_u8(a,b) vcge_u8(b,a) ++ ++ ++uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 ++#define vcle_u16(a,b) vcge_u16(b,a) ++ ++ ++uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 ++#define vcle_u32(a,b) vcge_u32(b,a) ++ ++uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... ++ res = _mm_cmpgt_epi8 ( a, b); ++ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal ++} ++ ++uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 
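++ //a <= b is built from the only available signed predicate, a > b:
++ //compute the a > b mask, then invert it by ANDNOT-ing it against the all-ones constant created above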
++ res = _mm_cmpgt_epi16 ( a, b); ++ return _mm_andnot_si128 (res, c1); ++} ++ ++uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 ++{ ++ __m128i c1, res; ++ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... ++ res = _mm_cmpgt_epi32 ( a, b); ++ return _mm_andnot_si128 (res, c1); ++} ++ ++uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmple_ps(a,b); ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 ++ { ++ //no unsigned chars comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu8(a, b); ++ return _mm_cmpeq_epi8(cmp, a); //a<=b ++ } ++#else ++ #define vcleq_u8(a,b) vcgeq_u8(b,a) ++#endif ++ ++ ++uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 ++ { ++ //no unsigned shorts comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu16(a, b); ++ return _mm_cmpeq_epi16(cmp, a); //a<=b ++ } ++#else ++ #define vcleq_u16(a,b) vcgeq_u16(b,a) ++#endif ++ ++ ++uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 ++#ifdef USE_SSE4 ++ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 ++ { ++ //no unsigned chars comparison in SSE, only signed available,so need the trick ++ __m128i cmp; ++ cmp = _mm_min_epu32(a, b); ++ return _mm_cmpeq_epi32(cmp, a); //a<=b ++ } ++#else ++//solution may be not optimal compared with the serial one ++ #define vcleq_u32(a,b) vcgeq_u32(b,a) ++#endif ++ ++ ++//****** Vector compare greater-than ****************************************** ++//************************************************************************** ++uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128 res; ++ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries ++ return64f(res); ++} ++ ++uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vcgtq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vcgtq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); 
// VCGT.U32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vcgtq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++#define vcgtq_s8 _mm_cmpgt_epi8 ++ ++uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++#define vcgtq_s16 _mm_cmpgt_epi16 ++ ++uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++#define vcgtq_s32 _mm_cmpgt_epi32 ++ ++uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) ++{ ++ __m128 res; ++ res = _mm_cmpgt_ps(a,b); //use only 2 first entries ++ return *(__m128i*)&res; ++} ++ ++uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 ++{ ++ //no unsigned chars comparison, only signed available,so need the trick ++ __m128i c128, as, bs; ++ c128 = _mm_set1_epi8 (128); ++ as = _mm_sub_epi8(a,c128); ++ bs = _mm_sub_epi8(b,c128); ++ return _mm_cmpgt_epi8 (as, bs); ++} ++ ++uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 ++{ ++ //no unsigned short comparison, only signed available,so need the trick ++ __m128i c8000, as, bs; ++ c8000 = _mm_set1_epi16 (0x8000); ++ as = _mm_sub_epi16(a,c8000); ++ bs = _mm_sub_epi16(b,c8000); ++ return _mm_cmpgt_epi16 ( as, bs); ++} ++ ++uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0 ++{ ++ //no unsigned int comparison, only signed available,so need the trick ++ __m128i c80000000, as, bs; ++ c80000000 = _mm_set1_epi32 (0x80000000); ++ as = _mm_sub_epi32(a,c80000000); ++ bs = _mm_sub_epi32(b,c80000000); ++ return _mm_cmpgt_epi32 ( as, bs); ++} ++ ++//********************* Vector compare less-than ************************** ++//************************************************************************* ++uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 ++#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!! ++ ++ ++uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 ++#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!! ++ ++ ++uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 ++#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!! ++ ++ ++uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 ++#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!! ++ ++uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 ++#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!! ++ ++uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 ++#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!! ++ ++uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 ++#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!! ++ ++uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 ++#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! ++ ++uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 ++#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! ++ ++uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 ++#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 
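++//Illustrative note: the less-than mappings in this section simply swap the operands of the
++//corresponding greater-than intrinsics. For example, with a = {1, 5} and b = {2, 5},
++//vclt_s32(a, b) = vcgt_s32(b, a) = {0xffffffff, 0x0} (1 < 2 is true, 5 < 5 is false).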
++ ++uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 ++#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! ++ ++uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 ++#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! ++ ++uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 ++#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! ++ ++uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 ++#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! ++ ++//*****************Vector compare absolute greater-than or equal ************ ++//*************************************************************************** ++uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmpge_ps ( a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmpge_ps ( a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//********Vector compare absolute less-than or equal ****************** ++//******************************************************************** ++uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmple_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmple_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//******** Vector compare absolute greater-than ****************** ++//****************************************************************** ++uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmpgt_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmpgt_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ 
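++//Illustrative note on the absolute-value compares (vcage/vcale/vcagt above, vcalt below):
++//masking each float with 0x7fffffff clears the sign bit, i.e. produces |a| and |b|, so the
++//"absolute" compares reduce to ordinary _mm_cmpge_ps/_mm_cmpgt_ps/_mm_cmple_ps/_mm_cmplt_ps
++//on the absolute values. For example, vcage_f32 with a = {-3.0f, 0.5f} and b = {2.0f, -0.5f}
++//compares {3.0f, 0.5f} against {2.0f, 0.5f} and returns {0xffffffff, 0xffffffff}.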
++//***************Vector compare absolute less-than *********************** ++//************************************************************************* ++uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); ++ a0 = _mm_cmplt_ps (a0, b0); ++ return64f(a0); ++} ++ ++uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 ++{ ++ __m128i c7fffffff; ++ __m128 a0, b0; ++ c7fffffff = _mm_set1_epi32 (0x7fffffff); ++ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); ++ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); ++ a0 = _mm_cmplt_ps (a0, b0); ++ return (*(__m128i*)&a0); ++} ++ ++//*************************Vector test bits************************************ ++//***************************************************************************** ++/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them ++with the corresponding element of a second vector. If the result is not zero, the ++corresponding element in the destination vector is set to all ones. Otherwise, it is set to ++all zeros. */ ++ ++uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 ++_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vtstq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 ++_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vtstq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 ++_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vtstq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 ++#define vtst_u8 vtst_s8 ++ ++uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 ++#define vtst_u16 vtst_s16 ++ ++uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 ++#define vtst_u32 vtst_s32 ++ ++ ++uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 ++#define vtst_p8 vtst_u8 ++ ++uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 ++_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi8 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 ++_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi16 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 ++_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 ++{ ++ __m128i zero, one, res; ++ zero = _mm_setzero_si128 (); ++ one = _mm_cmpeq_epi8(zero,zero); 
//0xfff..ffff ++ res = _mm_and_si128 (a, b); ++ res = _mm_cmpeq_epi32 (res, zero); ++ return _mm_xor_si128(res, one); //invert result ++} ++ ++uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 ++#define vtstq_u8 vtstq_s8 ++ ++uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 ++#define vtstq_u16 vtstq_s16 ++ ++uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 ++#define vtstq_u32 vtstq_s32 ++ ++uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 ++#define vtstq_p8 vtstq_u8 ++ ++//****************** Absolute difference ******************** ++//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** ++//************************************************************ ++int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vabdq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vabdq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vabdq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(vabdq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(vabdq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ return64(vabdq_u32(_pM128i(a), _pM128i(b))); ++} ++ ++float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = vabdq_f32(_pM128(a), _pM128(b)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi8 (a, b); ++ return _mm_abs_epi8 (res); ++} ++ ++int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi16 (a,b); ++ return _mm_abs_epi16 (res); ++} ++ ++int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 ++{ ++ __m128i res; ++ res = _mm_sub_epi32 (a,b); ++ return _mm_abs_epi32 (res); ++} ++ ++uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u8(a,b); ++ difab = _mm_sub_epi8(a,b); ++ difba = _mm_sub_epi8 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t 
vabdq_u16(uint16x8_t a, uint16x8_t b) ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u16(a,b); ++ difab = _mm_sub_epi16(a,b); ++ difba = _mm_sub_epi16 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) ++{ ++ __m128i cmp, difab, difba; ++ cmp = vcgtq_u32(a,b); ++ difab = _mm_sub_epi32(a,b); ++ difba = _mm_sub_epi32 (b,a); ++ difab = _mm_and_si128(cmp, difab); ++ difba = _mm_andnot_si128(cmp, difba); ++ return _mm_or_si128(difab, difba); ++} ++ ++float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 ++{ ++ __m128i c1; ++ __m128 res; ++ c1 = _mm_set1_epi32(0x7fffffff); ++ res = _mm_sub_ps (a, b); ++ return _mm_and_ps (res, *(__m128*)&c1); ++} ++ ++//************ Absolute difference - long ************************** ++//******************************************************************** ++int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0 ++{ ++ __m128i a16, b16; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ return vabdq_s16(a16, b16); ++ ++} ++ ++int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0 ++{ ++ __m128i a32, b32; ++ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 ++ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, ++ return vabdq_s32(a32, b32); ++} ++ ++int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no optimal SIMD solution, serial looks faster ++ _NEON2SSE_ALIGN_16 int64_t res[2]; ++ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0]; ++ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0]; ++ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1]; ++ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) ++{ ++ __m128i res; ++ res = vsubl_u8(a,b); ++ return _mm_abs_epi16(res); ++} ++ ++uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) ++{ ++ __m128i res; ++ res = vsubl_u16(a,b); ++ return _mm_abs_epi32(res); ++} ++ ++uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0]; ++ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0]; ++ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1]; ++ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//**********Absolute difference and 
accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* ++//********************************************************************************************* ++int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) ++{ ++ int8x8_t res64; ++ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); ++} ++ ++int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) ++{ ++ int16x4_t res64; ++ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) ++{ ++ int32x2_t res64; ++ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 ++#define vaba_u8 vaba_s8 ++ ++ ++uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0 ++#define vaba_u16 vaba_s16 ++ ++uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) ++{ ++ uint32x2_t res64; ++ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); ++} ++ ++int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 ++{ ++ int8x16_t sub; ++ sub = vabdq_s8(b, c); ++ return vaddq_s8( a, sub); ++} ++ ++int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 ++{ ++ int16x8_t sub; ++ sub = vabdq_s16(b, c); ++ return vaddq_s16( a, sub); ++} ++ ++int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 ++{ ++ int32x4_t sub; ++ sub = vabdq_s32(b, c); ++ return vaddq_s32( a, sub); ++} ++ ++uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) ++{ ++ uint8x16_t sub; ++ sub = vabdq_u8(b, c); ++ return vaddq_u8( a, sub); ++} ++ ++uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) ++{ ++ uint16x8_t sub; ++ sub = vabdq_u16(b, c); ++ return vaddq_u16( a, sub); ++} ++ ++uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) ++{ ++ uint32x4_t sub; ++ sub = vabdq_u32(b, c); ++ return vaddq_u32( a, sub); ++} ++ ++//************** Absolute difference and accumulate - long ******************************** ++//************************************************************************************* ++int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 ++_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0 ++{ ++ __m128i b16, c16, res; ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, ++ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, ++ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); ++ return _mm_add_epi16 (a, res); ++} ++ ++int32x4_t vabal_s16(int32x4_t 
a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 ++_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0 ++{ ++ __m128i b32, c32, res; ++ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 ++ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 ++ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); ++ return _mm_add_epi32 (a, res); ++} ++ ++int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res; ++ res = vabdl_s32(b,c); ++ return _mm_add_epi64(a, res); ++} ++ ++uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 ++_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ __m128i b16, c16, res; ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, ++ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, ++ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); ++ return _mm_add_epi16 (a, res); ++} ++ ++uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0 ++_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) ++{ ++ __m128i b32, c32, res; ++ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 ++ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 ++ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); ++ return _mm_add_epi32 (a, res); ++} ++ ++uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ __m128i res; ++ res = vabdl_u32(b,c); ++ return _mm_add_epi64(a, res); ++} ++ ++//*********************************************************************************** ++//**************** Maximum and minimum operations ********************************** ++//*********************************************************************************** ++//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ******* ++//*********************************************************************************** ++int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i res; ++ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial ++ return64(res); ++} ++ ++float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; ++ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; ++ return res; ++} ++ ++int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 ++#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 ++ ++int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 ++#define vmaxq_s16 _mm_max_epi16 ++ ++int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 ++#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 ++ ++uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 ++#define vmaxq_u8 _mm_max_epu8 ++ ++uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 ++#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 ++ ++uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 ++#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 ++ ++ ++float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 ++#define vmaxq_f32 _mm_max_ps ++ ++//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** ++//*********************************************************************************************************** ++int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits ++ return64(res); ++} ++ ++uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t res64; ++ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); ++} ++ ++ ++uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b) ++{ ++ uint32x2_t res64; ++ __m128i res; ++ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial ++ return64(res); ++} ++ ++float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; ++ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; ++ return res; ++} ++ ++int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 ++#define vminq_s8 _MM_MIN_EPI8 //SSE4.1 ++ ++int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 ++#define vminq_s16 _mm_min_epi16 ++ ++int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 ++#define vminq_s32 _MM_MIN_EPI32 //SSE4.1 ++ ++uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 ++#define vminq_u8 _mm_min_epu8 ++ ++uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 ++#define vminq_u16 _MM_MIN_EPU16 //SSE4.1 ++ ++uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 ++#define vminq_u32 _MM_MIN_EPU32 //SSE4.1 ++ ++float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 ++#define vminq_f32 _mm_min_ps ++ ++//************* Pairwise addition operations. 
************************************** ++//************************************************************************************ ++//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector ++int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit and then pack ++ int8x8_t res64; ++ __m128i a16, b16, res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 ++ res = _mm_hadd_epi16 (a16, b16); ++ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits ++ return64(res); ++} ++ ++int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ __m128i hadd128; ++ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); ++ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(hadd128); ++} ++ ++ ++int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ __m128i hadd128; ++ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); ++ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(hadd128); ++} ++ ++ ++uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0 ++{ ++ // no 8 bit hadd in IA32, need to go to 16 bit and then pack ++ uint8x8_t res64; ++// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work ++ __m128i mask8, a16, b16, res; ++ mask8 = _mm_set1_epi16(0xff); ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 ++ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 ++ res = _mm_hadd_epi16 (a16, b16); ++ res = _mm_and_si128(res, mask8); //to avoid saturation ++ res = _mm_packus_epi16 (res,res); //use low 64 bits ++ return64(res); ++} ++ ++uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0 ++{ ++ // solution may be not optimal, serial execution may be faster ++ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed ++ uint16x4_t res64; ++ __m128i c32767, cfffe, as, bs, res; ++ c32767 = _mm_set1_epi16 (32767); ++ cfffe = _mm_set1_epi16 (0xfffe); ++ as = _mm_sub_epi16 (_pM128i(a), c32767); ++ bs = _mm_sub_epi16 (_pM128i(b), c32767); ++ res = _mm_hadd_epi16 (as, bs); ++ res = _mm_add_epi16 (res, cfffe); ++ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(res); ++} ++ ++uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 ++_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster ++{ ++ //hadd doesn't work for unsigned values ++ uint32x2_t res64; ++ __m128i ab, ab_sh, res; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 ++ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0 ++ res = _mm_add_epi32(ab, ab_sh); ++ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ return64(res); ++} ++ ++float32x2_t vpadd_f32(float32x2_t a, float32x2_t 
b); // VPADD.F32 d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b) ++{ ++ __m128 hadd128; ++ __m64_128 res64; ++ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b)); ++ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits ++ _M64f(res64, hadd128); ++ return res64; ++} ++ ++ ++//************************** Long pairwise add ********************************** ++//********************************************************************************* ++//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, ++// and places the final results in the destination vector. ++ ++int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 ++_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit anyway ++ __m128i a16; ++ int16x4_t res64; ++ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 ++ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits ++ return64(a16); ++} ++ ++int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 ++_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0 ++{ ++ // solution may be not optimal, serial execution may be faster ++ int32x2_t res64; ++ __m128i r32_1; ++ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); ++ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits ++ return64(r32_1); ++} ++ ++int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1]; ++ return res; ++} ++ ++uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0 ++{ ++ // no 8 bit hadd in IA32, need to go to 16 bit ++// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work ++ uint16x4_t res64; ++ __m128i a16; ++ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits ++ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits ++ return64(a16); ++} ++ ++uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than a SIMD one ++ uint32x2_t res; ++ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1]; ++ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3]; ++ return res; ++} ++ ++uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1]; ++ return res; ++} ++ ++int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 ++_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i r16_1, r16_2; ++ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 ++ //swap hi and low part of r to process the remaining data ++ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r16_2 = _MM_CVTEPI8_EPI16 (r16_2); ++ return _mm_hadd_epi16 (r16_1, r16_2); ++} ++ ++int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 ++_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i 
r32_1, r32_2; ++ r32_1 = _MM_CVTEPI16_EPI32(a); ++ //swap hi and low part of r to process the remaining data ++ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r32_2 = _MM_CVTEPI16_EPI32 (r32_2); ++ return _mm_hadd_epi32 (r32_1, r32_2); ++} ++ ++int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 ++{ ++ _NEON2SSE_ALIGN_16 int32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 int64_t res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; ++ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 ++_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 ++{ ++ //no 8 bit hadd in IA32, need to go to 16 bit ++ __m128i r16_1, r16_2; ++ r16_1 = _MM_CVTEPU8_EPI16(a); ++ //swap hi and low part of r to process the remaining data ++ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ r16_2 = _MM_CVTEPU8_EPI16 (r16_2); ++ return _mm_hadd_epi16 (r16_1, r16_2); ++} ++ ++uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than a SIMD one ++ _NEON2SSE_ALIGN_16 uint16_t atmp[8]; ++ _NEON2SSE_ALIGN_16 uint32_t res[4]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; ++ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; ++ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; ++ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; ++ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//************************ Long pairwise add and accumulate ************************** ++//**************************************************************************************** ++//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, ++// and accumulates the values of the results into the elements of the destination (wide) vector ++int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 ++_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) ++{ ++ int16x4_t res64; ++ return64(vpadalq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 ++_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) ++{ ++ int32x2_t res64; ++ return64(vpadalq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 ++_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0]; ++ return res; ++} ++ ++uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) ++{ ++ uint16x4_t res64; ++ return64(vpadalq_u8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 
d0,d0 ++_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) ++{ ++ uint32x2_t res64; ++ return64(vpadalq_u16(_pM128i(a), _pM128i(b))); ++} ++ ++uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 ++_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0]; ++ return res; ++} ++ ++int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 ++_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 ++{ ++ int16x8_t pad; ++ pad = vpaddlq_s8(b); ++ return _mm_add_epi16 (a, pad); ++} ++ ++int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 ++_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 ++{ ++ int32x4_t pad; ++ pad = vpaddlq_s16(b); ++ return _mm_add_epi32(a, pad); ++} ++ ++int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 ++_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) ++{ ++ int64x2_t pad; ++ pad = vpaddlq_s32(b); ++ return _mm_add_epi64 (a, pad); ++} ++ ++uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 ++_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 ++{ ++ uint16x8_t pad; ++ pad = vpaddlq_u8(b); ++ return _mm_add_epi16 (a, pad); ++} ++ ++uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x4_t pad; ++ pad = vpaddlq_u16(b); ++ return _mm_add_epi32(a, pad); ++} //no optimal SIMD solution, serial is faster ++ ++uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no optimal SIMD solution, serial is faster ++ uint64x2_t pad; ++ pad = vpaddlq_u32(b); ++ return _mm_add_epi64(a, pad); ++} //no optimal SIMD solution, serial is faster ++ ++//********** Folding maximum ************************************* ++//******************************************************************* ++//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors, ++//and copies the larger of each pair into the corresponding element in the destination ++// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison ++int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0 ++{ ++ int8x8_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1 ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(max); //we need 64 bits only ++} ++ ++int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ int16x4_t res64; ++ __m128i ab, ab1, max; ++ 
_NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ max = _mm_max_epi16 (ab, ab1); ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(max); ++} ++ ++int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ int32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} ++ ++uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0 ++{ ++ uint8x8_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ max = _mm_max_epu8 (ab, ab1); // SSE4.1 ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(max); ++} ++ ++uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ uint16x4_t res64; ++ __m128i ab, ab1, max; ++ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ max = _MM_MAX_EPU16 (ab, ab1); ++ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(max); ++} ++ ++uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ uint32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} //serial solution looks faster than a SIMD one ++ ++float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; ++ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; ++ return res; ++} ++ ++// ***************** Folding minimum **************************** ++// ************************************************************** ++//vpmin -> takes minimum of adjacent pairs ++int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0 ++{ ++ int8x8_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding ++ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1 ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(min); ++} ++ ++int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ int16x4_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask ++ min = _mm_min_epi16 (ab, ab1); ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(min); ++} ++ ++int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ int32x2_t res; ++ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; ++ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; ++ return res; ++} ++ ++uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 ++_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0 ++{ ++ uint8x8_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; ++ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding ++ min = _mm_min_epu8 (ab, ab1); // SSE4.1 ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data ++ return64(min); ++} ++ ++uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0 ++_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0 ++{ ++ //solution may be not optimal compared with the serial one ++ uint16x4_t res64; ++ __m128i ab, ab1, min; ++ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number ++ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; ++ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab ++ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask ++ min = _MM_MIN_EPU16 (ab, ab1); ++ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask ++ return64(min); ++} ++ ++uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ uint32x2_t res; ++ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; ++ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; ++ return res; ++} ++ ++float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution looks faster than SIMD one ++ float32x2_t res; ++ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; ++ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; ++ return res; ++} ++ ++//*************************************************************** ++//*********** Reciprocal/Sqrt ************************************ ++//*************************************************************** ++//****************** Reciprocal estimate ******************************* ++//the ARM NEON and x86 SIMD results may be slightly different ++float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 ++_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = _mm_rcp_ps(_pM128(a)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! 
No reciprocal for ints in IA32 available ++ uint32x2_t res; ++ float resf, r; ++ int i, q, s; ++ for (i =0; i<2; i++){ ++ if((a.m64_u32[i] & 0x80000000) == 0) { ++ res.m64_u32[i] = 0xffffffff; ++ }else{ ++ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); ++ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ ++ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res.m64_u32[i] = r * (uint32_t)(1 << 31); ++ } ++ } ++ return res; ++} ++ ++float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 ++#define vrecpeq_f32 _mm_rcp_ps ++ ++ ++uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; ++ _NEON2SSE_ALIGN_16 uint32_t res[4]; ++ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000}; ++ float resf, r; ++ int i, q, s; ++ __m128i res128, mask, zero; ++ _mm_store_si128((__m128i*)atmp, a); ++ zero = _mm_setzero_si128(); ++ for (i =0; i<4; i++){ ++ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31)) ++ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ ++ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); ++ } ++ res128 = _mm_load_si128((__m128i*)res); ++ mask = _mm_and_si128(a, *(__m128i*)c80000000); ++ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff ++ return _mm_or_si128(res128, mask); ++} ++ ++//**********Reciprocal square root estimate **************** ++//********************************************************** ++//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster ++//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers ++////the ARM NEON and x86 SIMD results may be slightly different ++float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 ++_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = _mm_rsqrt_ps(_pM128(a)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double ++ uint32x2_t res; ++ __m128 tmp; ++ float r, resf, coeff; ++ int i,q0, q1, s;; ++ for (i =0; i<2; i++){ ++ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff ++ res.m64_u32[i] = 0xffffffff; ++ }else{ ++ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); ++ coeff = (resf < 0.5)? 
512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ ++ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ ++ r = ((float)q0 + 0.5) / coeff; ++ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ ++ _mm_store_ss(&r, tmp); ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res.m64_u32[i] = r * (((uint32_t)1) << 31); ++ } ++ } ++ return res; ++} ++ ++float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 ++#define vrsqrteq_f32 _mm_rsqrt_ps ++ ++uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //Input is fixed point number!!! ++ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double ++ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4]; ++ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)}; ++ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000}; ++ __m128 tmp; ++ __m128i res128, mask, zero; ++ float r, resf, coeff; ++ int i,q0, q1, s; ++ _mm_store_si128((__m128i*)atmp, a); ++ zero = _mm_setzero_si128(); ++ for (i =0; i<4; i++){ ++ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31))); ++ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ ++ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ ++ r = ((float)q0 + 0.5) / coeff; ++ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ ++ _mm_store_ss(&r, tmp); ++ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ ++ r = (float)s / 256.0; ++ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); ++ } ++ res128 = _mm_load_si128((__m128i*)res); ++ mask = _mm_and_si128(a, *(__m128i*)c_c0000000); ++ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff ++ return _mm_or_si128(res128, mask); ++} ++//************ Reciprocal estimate/step and 1/sqrt estimate/step *************************** ++//****************************************************************************************** ++//******VRECPS (Vector Reciprocal Step) *************************************************** ++//multiplies the elements of one vector by the corresponding elements of another vector, ++//subtracts each of the results from 2, and places the final results into the elements of the destination vector. ++ ++float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 ++_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x4_t res; ++ __m64_128 res64; ++ res = vrecpsq_f32(_pM128(a), _pM128(b)); ++ _M64f(res64, res); ++ return res64; ++} ++ ++float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 ++_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0 ++{ ++ __m128 f2, mul; ++ f2 = _mm_set1_ps(2.); ++ mul = _mm_mul_ps(a,b); ++ return _mm_sub_ps(f2,mul); ++} ++ ++//*****************VRSQRTS (Vector Reciprocal Square Root Step) ***************************** ++//multiplies the elements of one vector by the corresponding elements of another vector, ++//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector. 
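For context, the VRECPE/VRSQRTE intrinsics emulated above return only the coarse ARM estimate, and the VRECPS/VRSQRTS step operations described here (2 - a*b and (3 - a*b)/2 respectively) exist so callers can refine that estimate by Newton-Raphson iteration. The sketch below is an illustrative aside rather than part of the patch: the helper names recip_f32/rsqrt_f32 are hypothetical, while vrecpe_f32, vrecps_f32, vrsqrte_f32, vrsqrts_f32 and vmul_f32 are the standard arm_neon.h intrinsics this header provides.

#include <arm_neon.h>   /* standard NEON intrinsics header (assumed available) */

/* approximate 1/d: vrecps_f32(d, x) returns 2 - d*x, so
   x * (2 - d*x) is one Newton-Raphson step towards 1/d */
static float32x2_t recip_f32(float32x2_t d)
{
    float32x2_t x = vrecpe_f32(d);          /* coarse estimate */
    x = vmul_f32(x, vrecps_f32(d, x));      /* first refinement */
    x = vmul_f32(x, vrecps_f32(d, x));      /* second refinement */
    return x;
}

/* approximate 1/sqrt(d): vrsqrts_f32(a, b) returns (3 - a*b)/2, so
   x * vrsqrts_f32(d*x, x) is one Newton-Raphson step towards 1/sqrt(d) */
static float32x2_t rsqrt_f32(float32x2_t d)
{
    float32x2_t x = vrsqrte_f32(d);
    x = vmul_f32(x, vrsqrts_f32(vmul_f32(d, x), x));
    x = vmul_f32(x, vrsqrts_f32(vmul_f32(d, x), x));
    return x;
}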
++
++float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
++_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
++{
++ float32x2_t res;
++ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
++ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
++ return res;
++}
++
++float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
++_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
++{
++ __m128 f3, f05, mul;
++ f3 = _mm_set1_ps(3.);
++ f05 = _mm_set1_ps(0.5);
++ mul = _mm_mul_ps(a,b);
++ f3 = _mm_sub_ps(f3,mul);
++ return _mm_mul_ps (f3, f05);
++}
++//********************************************************************************************
++//***************************** Shifts by signed variable ***********************************
++//********************************************************************************************
++//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
++//********************************************************************************************
++//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
++//helper macro. It matches ARM implementation for big shifts
++#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
++ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
++ for (i = 0; i<LEN; i++) { \
++ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
++ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? 
a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \ ++ return res; ++ ++int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(8, i, 8) ++} ++ ++int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(16, i, 4) ++} ++ ++int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(32, i, 2) ++} ++ ++int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(64, i, 1) ++} ++ ++uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(8, u, 8) ++} ++ ++uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(16, u, 4) ++} ++ ++uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT_64(32, u, 2) ++} ++ ++uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers ++{ ++ SERIAL_SHIFT_64(64, u, 1) ++} ++ ++int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int8_t, int8_t, 16, 16) ++} ++ ++int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int16_t, int16_t, 8, 8) ++} ++ ++int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int32_t, int32_t, 4, 4) ++} ++ ++int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(int64_t, int64_t, 2, 2) ++} ++ ++uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(uint8_t, int8_t, 16, 16) ++} ++ ++uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SHIFT(uint16_t, int16_t, 8, 8) ++} ++ ++uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t 
b); // VSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
++}
++
++uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
++}
++
++
++//*********** Vector saturating shift left: (negative values shift right) **********************
++//********************************************************************************************
++//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
++#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
++ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
++ else{ \
++ if (btmp[i]>lanesize_1) { \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
++ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ else res[i] = atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
++ TYPE lanesize = (sizeof(TYPE) << 3); \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
++ else{ \
++ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
++ else{ \
++ limit = (TYPE) 1 << (lanesize - btmp[i]); \
++ res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
++ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize_1) { \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
++ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
++ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
++ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
++ else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
++ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? 
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \ ++ return res; ++ ++int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8) ++} ++ ++int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4) ++} ++ ++int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2) ++} ++ ++int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1) ++} ++ ++uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8) ++} ++ ++uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4) ++} ++ ++uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2) ++} ++ ++uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1) ++} ++ ++int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) ++} ++ ++int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) ++} ++ ++int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) ++} ++ ++int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) ++} ++ ++uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) ++} ++ ++uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
++}
++
++uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
++}
++
++uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
++}
++
++
++//******** Vector rounding shift left: (negative values shift right) **********
++//****************************************************************************
++//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
++//rounding makes sense for right shifts only.
++#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if(btmp[i] >= 0) { \
++ if(btmp[i] >= lanesize) res[i] = 0; \
++ else res[i] = (atmp[i] << btmp[i]); \
++ }else{ \
++ res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
++ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
++ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
++ return _mm_load_si128((__m128i*)res);
++
++
++#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
++ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
++ for (i = 0; i<LEN; i++) { \
++ if(b.m64_i ## TYPE[i] >= 0) { \
++ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
++ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
++ }else{ \
++ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
++ (b.m64_i ## TYPE[i] == -lanesize) ? 
(a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \ ++ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \ ++ return res; ++ ++ ++int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(8,i,8) ++} ++ ++int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(16,i,4) ++} ++ ++int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(32,i,2) ++} ++ ++int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(64,i,1) ++} ++ ++uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(8,u,8) ++} ++ ++uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(16,u,4) ++} ++ ++uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(32,u,2) ++} ++ ++uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT_64(64,u,1) ++} ++ ++int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) ++} ++ ++int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) ++} ++ ++int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) ++} ++ ++int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) ++} ++ ++uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 
16, 16)
++}
++
++uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
++}
++
++uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
++}
++
++uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
++}
++
++
++//********** Vector saturating rounding shift left: (negative values shift right) ****************
++//*************************************************************************************************
++//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
++//Saturation happens for left shifts only while rounding makes sense for right shifts only.
++#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
++ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
++ else{ \
++ if (btmp[i]>lanesize_1) { \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
++ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
++ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
++ else res[i] = atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
++ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
++ int lanesize = (sizeof(TYPE) << 3); \
++ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
++ for (i = 0; i<LEN; i++) { \
++ if (atmp[i] ==0) res[i] = 0; \
++ else{ \
++ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
++ else{ \
++ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
++ else{ \
++ limit = (TYPE) 1 << (lanesize - btmp[i]); \
++ res[i] = ( atmp[i] >= limit) ? 
res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
++ return _mm_load_si128((__m128i*)res);
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
++ __m64_128 res; int ## TYPE ## _t limit; int i; \
++ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize_1) { \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
++ }else{ \
++ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
++ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
++ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
++ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
++ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
++ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
++ for (i = 0; i<LEN; i++) { \
++ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \
++ else{ \
++ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
++ else{ \
++ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
++ else{ \
++ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
++ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
++ return res;
++
++int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
++}
++
++int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
++}
++
++int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
++}
++
++int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
++}
++
++uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
++}
++
++uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
++}
++
++uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
++{
++ 
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2) ++} ++ ++uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1) ++} ++ ++int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) ++} ++ ++int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) ++} ++ ++int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) ++} ++ ++int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) ++} ++ ++uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) ++} ++ ++uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) ++} ++ ++uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) ++} ++ ++uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) ++} ++ ++// ********************************************************************************* ++// ***************************** Shifts by a constant ***************************** ++// ********************************************************************************* ++//**************** Vector shift right by constant************************************* ++//************************************************************************************ ++int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_srai_epi16 (r, b); //SSE2 ++ r = _mm_packs_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b) ++{ ++ int16x4_t 
res64; ++ return64(_mm_srai_epi16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(_mm_srai_epi32(_pM128i(a), b)); ++} ++ ++int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //no arithmetic shift for 64bit values, serial solution used ++ int64x1_t res; ++ if(b>=64) res.m64_i64[0] = 0; ++ else res.m64_i64[0] = (*(int64_t*)&a) >> b; ++ return res; ++} ++ ++uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ uint8x8_t res64; ++ __m128i r; ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one ++ r = _mm_packus_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b) ++{ ++ uint16x4_t res64; ++ return64(_mm_srli_epi16(_pM128i(a), b)); ++} ++ ++ ++uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ return64(_mm_srli_epi32(_pM128i(a), b)); ++} ++ ++ ++uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b) ++{ ++ uint64x1_t res64; ++ return64(_mm_srli_epi64(_pM128i(a), b)); ++} ++ ++ ++int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i zero, mask0, a_sign, r, a_sign_mask; ++ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; ++ zero = _mm_setzero_si128(); ++ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 ++ r = _mm_srai_epi16 (a, b); ++ a_sign_mask = _mm_and_si128 (mask0, a_sign); ++ r = _mm_andnot_si128 (mask0, r); ++ return _mm_or_si128 (r, a_sign_mask); ++} ++ ++int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 ++#define vshrq_n_s16 _mm_srai_epi16 ++ ++int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 ++#define vshrq_n_s32 _mm_srai_epi32 ++ ++int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) ++{ ++ //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD ++ __m128i c1, signmask,a0, res64; ++ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; ++ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff ++ signmask = _mm_slli_epi64 (c1, (64 - b)); ++ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit ++ a0 = _MM_CMPEQ_EPI64 (a, a0); ++ signmask = _mm_and_si128(a0, 
signmask); ++ res64 = _mm_srli_epi64 (a, b); ++ return _mm_or_si128(res64, signmask); ++} ++ ++uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 ++{ ++ //no 8 bit shift available, need the special trick ++ __m128i mask0, r; ++ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; ++ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ r = _mm_srli_epi16 ( a, b); ++ return _mm_and_si128 (r, mask0); ++} ++ ++uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 ++#define vshrq_n_u16 _mm_srli_epi16 ++ ++uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 ++#define vshrq_n_u32 _mm_srli_epi32 ++ ++uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 ++#define vshrq_n_u64 _mm_srli_epi64 ++ ++//*************************** Vector shift left by constant ************************* ++//********************************************************************************* ++int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_slli_epi16 (r, b); //SSE2 ++ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b) ++{ ++ int16x4_t res64; ++ return64(_mm_slli_epi16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ int32x2_t res64; ++ return64(_mm_slli_epi32(_pM128i(a), b)); ++} ++ ++ ++int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b) ++{ ++ int64x1_t res64; ++ return64(_mm_slli_epi64(_pM128i(a), b)); ++} ++ ++ ++uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b) ++{ ++ //no 8 bit shift available, go to 16 bit ++ uint8x8_t res64; ++ __m128i mask8; ++ __m128i r; ++ mask8 = _mm_set1_epi16(0xff); ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r = _mm_slli_epi16 (r, b); //SSE2 ++ r = _mm_and_si128(r, mask8); //to avoid saturation ++ r = _mm_packus_epi16 (r,r); //we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 ++#define vshl_n_u16 vshl_n_s16 ++ ++ ++uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 ++#define vshl_n_u32 vshl_n_s32 ++ ++uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 ++#define vshl_n_u64 vshl_n_s64 ++ ++int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++#define vshlq_n_s8 vshlq_n_u8 ++ ++int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); 
// VSHL.I16 q0,q0,#0 ++#define vshlq_n_s16 _mm_slli_epi16 ++ ++int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++#define vshlq_n_s32 _mm_slli_epi32 ++ ++int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++#define vshlq_n_s64 _mm_slli_epi64 ++ ++uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) ++{ ++ //no 8 bit shift available, need the special trick ++ __m128i mask0, r; ++ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; ++ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift ++ r = _mm_slli_epi16 ( a, b); ++ return _mm_and_si128 (r, mask0); ++} ++ ++uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 ++#define vshlq_n_u16 vshlq_n_s16 ++ ++uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 ++#define vshlq_n_u32 vshlq_n_s32 ++ ++uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 ++#define vshlq_n_u64 vshlq_n_s64 ++ ++//************* Vector rounding shift right by constant ****************** ++//************************************************************************* ++//No corresponding x86 intrinsics exist, need to do some tricks ++int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit ++ int8x8_t res64; ++ __m128i r, maskb; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 ++ r = _mm_srai_epi16 (r, b); ++ r = _mm_add_epi16 (r, maskb); //actual rounding ++ r = _mm_packs_epi16 (r,r); ////we need 64 bits only ++ return64(r); ++} ++ ++int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b) ++{ ++ int16x4_t res64; ++ return64(vrshrq_n_s16(_pM128i(a), b)); ++} ++ ++ ++int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(vrshrq_n_s32(_pM128i(a), b)); ++} ++ ++ ++int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ //serial solution is faster ++ int64x1_t res; ++ int64_t a_i64 = *( int64_t*)&a; ++ if(b==64) { ++ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63; ++ } else { ++ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1)); ++ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1)); ++ } ++ return res; ++} ++ ++uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one ++ uint8x8_t res64; ++ __m128i r, maskb; ++ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit ++ maskb = 
_mm_srli_epi16 (maskb, 15); //1 or 0 ++ r = _mm_srli_epi16 (r, b); ++ r = _mm_add_epi16 (r, maskb); //actual rounding ++ r = _mm_packus_epi16 (r,r); ////we need 64 bits only ++ return64(r); ++} ++ ++uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b) ++{ ++ uint16x4_t res64; ++ return64(vrshrq_n_u16(_pM128i(a), b)); ++} ++ ++ ++uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ return64(vrshrq_n_u32(_pM128i(a), b)); ++} ++ ++ ++uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b) ++{ ++ uint64x1_t res64; ++ return64(vrshrq_n_u64(_pM128i(a), b)); ++} ++ ++int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i r, mask1, maskb; ++ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 ++ r = vshrq_n_s8 (a, b); ++ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding ++ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding ++ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 ++ return _mm_add_epi8(r, maskb); //actual rounding ++} ++ ++int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 ++ r = _mm_srai_epi16 (a, b); ++ return _mm_add_epi16 (r, maskb); //actual rounding ++} ++ ++int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 ++ r = _mm_srai_epi32(a, b); ++ return _mm_add_epi32 (r, maskb); //actual rounding ++} ++ ++int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) ++{ ++ //solution may be not optimal compared with a serial one ++ __m128i maskb; ++ int64x2_t r; ++ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 ++ r = vshrq_n_s64(a, b); ++ return _mm_add_epi64 (r, maskb); //actual rounding ++} ++ ++uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 ++{ ++ //no 8 bit shift available, go to 16 bit trick ++ __m128i r, mask1, maskb; ++ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 ++ r = vshrq_n_u8 (a, b); ++ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding ++ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding ++ 
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 ++ return _mm_add_epi8(r, maskb); //actual rounding ++} ++ ++uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 ++ r = _mm_srli_epi16 (a, b); ++ return _mm_add_epi16 (r, maskb); //actual rounding ++} ++ ++uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 ++{ ++ __m128i maskb, r; ++ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 ++ r = _mm_srli_epi32(a, b); ++ return _mm_add_epi32 (r, maskb); //actual rounding ++} ++ ++uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) ++{ ++ //solution may be not optimal compared with a serial one ++ __m128i maskb, r; ++ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit ++ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 ++ r = _mm_srli_epi64(a, b); ++ return _mm_add_epi64 (r, maskb); //actual rounding ++} ++ ++//************* Vector shift right by constant and accumulate ********* ++//********************************************************************* ++int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8 ++{ ++ int8x8_t shift; ++ shift = vshr_n_s8(b, c); ++ return vadd_s8( a, shift); ++} ++ ++int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16 ++{ ++ int16x4_t shift; ++ shift = vshr_n_s16( b, c); ++ return vadd_s16(a, shift); ++} ++ ++int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ int32x2_t shift; ++ shift = vshr_n_s32(b, c); ++ return vadd_s32( a, shift); ++} ++ ++int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 ++_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) ++{ ++ //may be not optimal compared with a serial solution ++ int64x1_t shift; ++ shift = vshr_n_s64(b, c); ++ return vadd_s64( a, shift); ++} ++ ++uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8 ++{ ++ uint8x8_t shift; ++ shift = vshr_n_u8(b, c); ++ return vadd_u8(a, shift); ++} ++ ++uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16 ++{ ++ uint16x4_t shift; ++ shift = vshr_n_u16(b, c); ++ return vadd_u16(a,shift); ++} ++ ++uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // 
VSRA.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ uint32x2_t shift; ++ shift = vshr_n_u32(b, c); ++ return vadd_u32( a, shift); ++} ++ ++uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 ++_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64 ++{ ++ //may be not optimal compared with the serial execution ++ uint64x1_t shift; ++ shift = vshr_n_u64(b, c); ++ return vadd_u64(a, shift); ++} ++ ++int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 ++{ ++ int8x16_t shift; ++ shift = vshrq_n_s8(b, c); ++ return vaddq_s8(a, shift); ++} ++ ++int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 ++{ ++ int16x8_t shift; ++ shift = vshrq_n_s16(b, c); ++ return vaddq_s16(a, shift); ++} ++ ++int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 ++{ ++ int32x4_t shift; ++ shift = vshrq_n_s32(b, c); ++ return vaddq_s32(a, shift); ++} ++ ++int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 ++{ ++ int64x2_t shift; ++ shift = vshrq_n_s64(b, c); ++ return vaddq_s64( a, shift); ++} ++ ++uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 ++{ ++ uint8x16_t shift; ++ shift = vshrq_n_u8(b, c); ++ return vaddq_u8(a, shift); ++} ++ ++uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16 ++{ ++ uint16x8_t shift; ++ shift = vshrq_n_u16(b, c); ++ return vaddq_u16(a, shift); ++} ++ ++uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 ++{ ++ uint32x4_t shift; ++ shift = vshrq_n_u32(b, c); ++ return vaddq_u32(a, shift); ++} ++ ++uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 ++{ ++ uint64x2_t shift; ++ shift = vshrq_n_u64(b, c); ++ return vaddq_u64(a, shift); ++} ++ ++//************* Vector rounding shift right by constant and accumulate **************************** ++//************************************************************************************************ ++int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8 ++{ ++ int8x8_t shift; ++ shift = 
vrshr_n_s8(b, c); ++ return vadd_s8( a, shift); ++} ++ ++int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16 ++{ ++ int16x4_t shift; ++ shift = vrshr_n_s16( b, c); ++ return vadd_s16(a, shift); ++} ++ ++int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ int32x2_t shift; ++ shift = vrshr_n_s32(b, c); ++ return vadd_s32( a, shift); ++} ++ ++int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution ++{ ++ int64x1_t shift; ++ shift = vrshr_n_s64(b, c); ++ return vadd_s64( a, shift); ++} ++ ++uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 ++_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8 ++{ ++ uint8x8_t shift; ++ shift = vrshr_n_u8(b, c); ++ return vadd_u8(a, shift); ++} ++ ++uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16 ++_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16 ++{ ++ uint16x4_t shift; ++ shift = vrshr_n_u16(b, c); ++ return vadd_u16(a,shift); ++} ++ ++uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 ++_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32 ++{ ++ //may be not optimal compared with the serial execution ++ uint32x2_t shift; ++ shift = vrshr_n_u32(b, c); ++ return vadd_u32( a, shift); ++} ++ ++uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution ++{ ++ //may be not optimal compared with the serial execution ++ uint64x1_t shift; ++ shift = vrshr_n_u64(b, c); ++ return vadd_u64( a, shift); ++} ++ ++int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 ++{ ++ int8x16_t shift; ++ shift = vrshrq_n_s8(b, c); ++ return vaddq_s8(a, shift); ++} ++ ++int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 ++{ ++ int16x8_t shift; ++ shift = vrshrq_n_s16(b, c); ++ return vaddq_s16(a, shift); ++} ++ ++int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 ++{ ++ int32x4_t shift; ++ shift = vrshrq_n_s32(b, c); ++ return vaddq_s32(a, shift); ++} ++ ++int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 ++_NEON2SSE_INLINE 
int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) ++{ ++ int64x2_t shift; ++ shift = vrshrq_n_s64(b, c); ++ return vaddq_s64(a, shift); ++} ++ ++uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 ++_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 ++{ ++ uint8x16_t shift; ++ shift = vrshrq_n_u8(b, c); ++ return vaddq_u8(a, shift); ++} ++ ++uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 ++_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 ++{ ++ uint16x8_t shift; ++ shift = vrshrq_n_u16(b, c); ++ return vaddq_u16(a, shift); ++} ++ ++uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 ++_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 ++{ ++ uint32x4_t shift; ++ shift = vrshrq_n_u32(b, c); ++ return vaddq_u32(a, shift); ++} ++ ++uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 ++_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) ++{ ++ uint64x2_t shift; ++ shift = vrshrq_n_u64(b, c); ++ return vaddq_u64(a, shift); ++} ++ ++//**********************Vector saturating shift left by constant ***************************** ++//******************************************************************************************** ++//we don't check const ranges assuming they are met ++int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) ++ int8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); ++ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only ++ return64(r128); ++} ++ ++int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0 ++{ ++ // go to 32 bit to get the auto saturation (in packs function) ++ int16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only ++ return64(r128); ++} ++ ++int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ //serial execution may be faster ++ int32x2_t res64; ++ return64(vqshlq_n_s32 (_pM128i(a), b)); ++} ++ ++ ++int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ int64x1_t res; ++ int64_t bmask; ++ int64_t a_i64 = *( int64_t*)&a; ++ bmask = ( int64_t)1 << (63 - b); //positive ++ if (a_i64 >= bmask) { ++ res.m64_i64[0] = ~(_SIGNBIT64); ++ } else { ++ res.m64_i64[0] = (a_i64 <= -bmask) ? 
_SIGNBIT64 : a_i64 << b; ++ } ++ return res; ++} ++ ++ ++uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit ++ uint8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); //shift_res ++ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only ++ return64(r128); ++} ++ ++uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0 ++_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0 ++{ ++ // go to 32 bit to get the auto saturation (in packus function) ++ uint16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16 ++ return64(r128); ++} ++ ++uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 ++_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b) ++{ ++ uint32x2_t res64; ++ return64(vqshlq_n_u32(_pM128i(a), b)); ++} ++ ++uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ uint64x1_t res; ++ uint64_t bmask; ++ uint64_t a_i64 = *(uint64_t*)&a; ++ bmask = ( uint64_t)1 << (64 - b); ++ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a ++ return res; ++} ++ ++int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 ++_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 ++{ ++ // go to 16 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 ++} ++ ++int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 ++_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 ++{ ++ // manual saturation solution looks LESS optimal than 32 bits conversion one ++ // go to 32 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi32 (a128, b); //shift_res ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI16_EPI32 (a128); ++ r128_2 = _mm_slli_epi32 (a128, b); ++ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 ++} ++ ++int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 ++_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 ++{ ++ // no 64 bit saturation option available, special tricks necessary ++ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; ++ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff ++ maskA = _mm_srli_epi32(c1, b + 
1); //mask for positive numbers (32-b+1) zeros and b-1 ones ++ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise ++ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not ++ shift_res = _mm_slli_epi32 (a, b); ++ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); ++ //result with positive numbers saturated ++ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); ++ //treat negative numbers ++ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros ++ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise ++ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not ++ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); ++ return _mm_or_si128 (c7ffffff_mask, shift_res_mask); ++} ++ ++int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; ++ int64_t bmask; ++ int i; ++ bmask = ( int64_t)1 << (63 - b); //positive ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ if (atmp[i] >= bmask) { ++ res[i] = ~(_SIGNBIT64); ++ } else { ++ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b; ++ } ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 ++{ ++ // go to 16 bit to get the auto saturation (in packs function) ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPU8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 ++} ++ ++uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 ++_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 ++{ ++ // manual saturation solution looks more optimal than 32 bits conversion one ++ __m128i cb, c8000, a_signed, saturation_mask, shift_res; ++ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); ++ c8000 = _mm_set1_epi16 (0x8000); ++//no unsigned shorts comparison in SSE, only signed available, so need the trick ++ a_signed = _mm_sub_epi16(a, c8000); //go to signed ++ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); ++ shift_res = _mm_slli_epi16 (a, b); ++ return _mm_or_si128 (shift_res, saturation_mask); ++} ++ ++uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 ++_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 ++{ ++ // manual saturation solution, no 64 bit saturation option, the serial version may be faster ++ __m128i cb, c80000000, a_signed, saturation_mask, shift_res; ++ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); ++ c80000000 = _mm_set1_epi32 (0x80000000); ++//no unsigned ints comparison in SSE, only signed available, so need the trick ++ a_signed = _mm_sub_epi32(a, c80000000); //go to signed ++ saturation_mask = _mm_cmpgt_epi32 
(a_signed, cb); ++ shift_res = _mm_slli_epi32 (a, b); ++ return _mm_or_si128 (shift_res, saturation_mask); ++} ++ ++uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here ++ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; ++ uint64_t bmask; ++ int i; ++ bmask = ( uint64_t)1 << (64 - b); ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//**************Vector signed->unsigned saturating shift left by constant ************* ++//************************************************************************************* ++uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 ++_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0 ++{ ++ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) ++ uint8x8_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi16 (a128, b); ++ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only ++ return64(r128); ++} ++ ++uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 ++_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0 ++{ ++ uint16x4_t res64; ++ __m128i a128, r128; ++ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 ++ r128 = _mm_slli_epi32 (a128, b); //shift_res ++ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only ++ return64(r128); ++} ++ ++uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b) ++{ ++ int32x2_t res64; ++ return64( vqshluq_n_s32(_pM128i(a), b)); ++} ++ ++uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster ++{ ++ uint64x1_t res; ++ uint64_t limit; ++ if (a.m64_i64[0]<=0) { ++ res.m64_u64[0] = 0; ++ } else { ++ limit = (uint64_t) 1 << (64 - b); ++ res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? 
res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b; ++ } ++ return res; ++} ++ ++uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 ++_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 ++{ ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi16 (a128, b); ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI8_EPI16 (a128); ++ r128_2 = _mm_slli_epi16 (a128, b); ++ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 ++} ++ ++uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 ++_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 ++{ ++ // manual saturation solution looks LESS optimal than 32 bits conversion one ++ __m128i a128, r128_1, r128_2; ++ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 ++ r128_1 = _mm_slli_epi32 (a128, b); //shift_res ++ //swap hi and low part of a128 to process the remaining data ++ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); ++ a128 = _MM_CVTEPI16_EPI32 (a128); ++ r128_2 = _mm_slli_epi32 (a128, b); ++ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 ++} ++ ++uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 ++_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; ++ zero = _mm_setzero_si128(); ++ maskA = _mm_cmpeq_epi32(a, a); ++ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros ++ //saturate negative numbers to zero ++ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) ++ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now ++ //saturate positive to 0xffffffff ++ a_masked = _mm_and_si128 (a0, maskA); ++ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise ++ a_shift = _mm_slli_epi32 (a0, b); ++ return _mm_or_si128 (a_shift, a_masked); //actual saturation ++} ++ ++uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ // no effective SIMD solution here, serial execution looks faster ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ _NEON2SSE_ALIGN_16 uint64_t res[2]; ++ uint64_t limit; ++ int i; ++ _mm_store_si128((__m128i*)atmp, a); ++ for (i = 0; i<2; i++) { ++ if (atmp[i]<=0) { ++ res[i] = 0; ++ } else { ++ limit = (uint64_t) 1 << (64 - b); ++ res[i] = ( ((uint64_t)atmp[i]) >= limit) ? 
res[i] = ~((uint64_t)0) : atmp[i] << b; ++ } ++ } ++ return _mm_load_si128((__m128i*)res); ++} ++ ++//************** Vector narrowing shift right by constant ************** ++//********************************************************************** ++int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r16 = vshrq_n_s16(a,b); ++ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r16); ++} ++ ++int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r32 = vshrq_n_s32(a,b); ++ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r32); ++} ++ ++int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ __m128i r64; ++ r64 = vshrq_n_s64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i mask, r16; ++ mask = _mm_set1_epi16(0xff); ++ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_and_si128(r16, mask); //to avoid saturation ++ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i mask, r32; ++ mask = _mm_set1_epi32(0xffff); ++ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16) ++ r32 = _mm_and_si128(r32, mask); //to avoid saturation ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res64; ++ __m128i r64; ++ r64 = vshrq_n_u64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++//************** Vector signed->unsigned narrowing saturating shift right by constant ******** ++//********************************************************************************************* ++uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t 
vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i r16; ++ r16 = vshrq_n_s16(a,b); ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i r32; ++ r32 = vshrq_n_s32(a,b); ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster ++{ ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ uint32x2_t res; ++ int64_t res64; ++ _mm_store_si128((__m128i*)atmp, a); ++ if (atmp[0] < 0) { ++ res.m64_u32[0] = 0; ++ } else { ++ res64 = (atmp[0] >> b); ++ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64; ++ } ++ if (atmp[1] < 0) { ++ res.m64_u32[1] = 0; ++ } else { ++ res64 = (atmp[1] >> b); ++ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64; ++ } ++ return res; ++} ++ ++//**** Vector signed->unsigned rounding narrowing saturating shift right by constant ***** ++uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i r16; ++ uint8x8_t res64; ++ r16 = vrshrq_n_s16(a,b); ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16 ++{ ++ //solution may be not optimal compared with the serial one ++ __m128i r32; ++ uint16x4_t res64; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster ++{ ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ uint32x2_t res; ++ int64_t res64; ++ _mm_store_si128((__m128i*)atmp, a); ++ if (atmp[0] < 0) { ++ res.m64_u32[0] = 0; ++ } else { ++ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); ++ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64; ++ } ++ if (atmp[1] < 0) { ++ res.m64_u32[1] = 0; ++ } else { ++ res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); ++ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 
0xffffffff : res64; ++ } ++ return res; ++} ++ ++//***** Vector narrowing saturating shift right by constant ****** ++//***************************************************************** ++int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ r16 = vshrq_n_s16(a,b); ++ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ r32 = vshrq_n_s32(a,b); ++ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //no optimal SIMD solution found ++ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2]; ++ int32x2_t res; ++ _mm_store_si128((__m128i*)atmp, a); ++ res64[0] = (atmp[0] >> b); ++ res64[1] = (atmp[1] >> b); ++ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; ++ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; ++ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; ++ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; ++ res.m64_i32[0] = (int32_t)res64[0]; ++ res.m64_i32[1] = (int32_t)res64[1]; ++ return res; ++} ++ ++uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i r16; ++ r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i r32; ++ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) ++{ ++ //serial solution may be faster ++ uint32x2_t res64; ++ __m128i r64, res_hi, zero; ++ zero = _mm_setzero_si128(); ++ r64 = vshrq_n_u64(a,b); ++ res_hi = _mm_srli_epi64(r64, 32); ++ res_hi = _mm_cmpgt_epi32(res_hi, zero); ++ r64 = _mm_or_si128(r64, res_hi); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++ ++//********* Vector rounding narrowing shift right by constant ************************* ++//**************************************************************************************** ++int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r16 = vrshrq_n_s16(a,b); ++ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. 
Impossible to use _mm_packs because of negative saturation problems ++ return64(r16); ++} ++ ++int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems ++ return64(r32); ++} ++ ++int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ __m128i r64; ++ r64 = vrshrq_n_s64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i mask, r16; ++ mask = _mm_set1_epi16(0xff); ++ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_and_si128(r16, mask); //to avoid saturation ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i mask, r32; ++ mask = _mm_set1_epi32(0xffff); ++ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) ++ r32 = _mm_and_si128(r32, mask); //to avoid saturation ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster ++{ ++ uint32x2_t res64; ++ __m128i r64; ++ r64 = vrshrq_n_u64(a,b); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++//************* Vector rounding narrowing saturating shift right by constant ************ ++//**************************************************************************************** ++int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 ++_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8 ++{ ++ int8x8_t res64; ++ __m128i r16; ++ r16 = vrshrq_n_s16(a,b); ++ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 ++_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16 ++{ ++ int16x4_t res64; ++ __m128i r32; ++ r32 = vrshrq_n_s32(a,b); ++ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++int32x2_t vqrshrn_n_s64(int64x2_t a, 
__constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //no optimal SIMD solution found ++ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2]; ++ int32x2_t res; ++ _mm_store_si128((__m128i*)atmp, a); ++ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1)); ++ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result ++ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1)); ++ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result ++ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; ++ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; ++ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; ++ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; ++ res.m64_i32[0] = (int32_t)res64[0]; ++ res.m64_i32[1] = (int32_t)res64[1]; ++ return res; ++} ++ ++uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8 ++_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8 ++{ ++ uint8x8_t res64; ++ __m128i r16; ++ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) ++ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only ++ return64(r16); ++} ++ ++uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 ++_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16 ++{ ++ uint16x4_t res64; ++ __m128i r32; ++ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) ++ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only ++ return64(r32); ++} ++ ++uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 ++_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) ++{ ++ //serial solution may be faster ++ uint32x2_t res64; ++ __m128i r64, res_hi, zero; ++ zero = _mm_setzero_si128(); ++ r64 = vrshrq_n_u64(a,b); ++ res_hi = _mm_srli_epi64(r64, 32); ++ res_hi = _mm_cmpgt_epi32(res_hi, zero); ++ r64 = _mm_or_si128(r64, res_hi); ++ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(r64); ++} ++ ++//************** Vector widening shift left by constant **************** ++//************************************************************************ ++int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 ++_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0 ++{ ++ __m128i r; ++ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 ++ return _mm_slli_epi16 (r, b); ++} ++ ++int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 ++_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0 ++{ ++ __m128i r; ++ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1, ++ return _mm_slli_epi32 (r, b); ++} ++ ++int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 ++_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0 ++{ ++ __m128i r; ++ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1, ++ return _mm_slli_epi64 (r, b); ++} ++ ++uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 ++_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0 ++{ ++ //no uint8 to uint16 conversion available, manual conversion used ++ __m128i zero, r; ++ zero = _mm_setzero_si128 (); ++ r = _mm_unpacklo_epi8(_pM128i(a), zero); ++ return _mm_slli_epi16 (r, b); ++} ++ ++uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0 ++_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, 
__constrange(0,16) int b) // VSHLL.s16 q0,d0,#0 ++{ ++ //no uint16 to uint32 conversion available, manual conversion used ++ __m128i zero, r; ++ zero = _mm_setzero_si128 (); ++ r = _mm_unpacklo_epi16(_pM128i(a), zero); ++ return _mm_slli_epi32 (r, b); ++} ++ ++uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 ++_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0 ++{ ++ //no uint32 to uint64 conversion available, manual conversion used ++ __m128i zero, r; ++ zero = _mm_setzero_si128 (); ++ r = _mm_unpacklo_epi32(_pM128i(a), zero); ++ return _mm_slli_epi64 (r, b); ++} ++ ++//************************************************************************************ ++//**************************** Shifts with insert ************************************ ++//************************************************************************************ ++//takes each element in a vector, shifts them by an immediate value, ++//and inserts the results in the destination vector. Bits shifted out of the each element are lost. ++ ++//**************** Vector shift right and insert ************************************ ++//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. ++//All other bits are taken from b shifted. ++int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) ++{ ++ int8x8_t res64; ++ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) ++{ ++ int16x4_t res64; ++ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) ++{ ++ int32x2_t res64; ++ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) ++{ ++ int64x1_t res; ++ if (c ==64) ++ res = a; ++ else{ ++ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros ++ } ++ return res; ++} ++ ++uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++#define vsri_n_u8 vsri_n_s8 ++ ++uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++#define vsri_n_u16 vsri_n_s16 ++ ++uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 ++#define vsri_n_u32 vsri_n_s32 ++ ++ ++uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 ++#define vsri_n_u64 vsri_n_s64 ++ ++poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 ++#define vsri_n_p8 vsri_n_u8 ++ ++poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 ++#define vsri_n_p16 vsri_n_u16 ++ ++int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 
++{ ++ __m128i maskA, a_masked; ++ uint8x16_t b_shift; ++ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used ++ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros ++ a_masked = _mm_and_si128 (a, maskA); ++ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift ++ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) ++} ++ ++int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 ++{ ++ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a ++ uint16x8_t b_shift; ++ uint16x8_t a_c; ++ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift ++ a_c = vshrq_n_u16( a, (16 - c)); ++ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 ++{ ++ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a ++ uint32x4_t b_shift; ++ uint32x4_t a_c; ++ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift ++ a_c = vshrq_n_u32( a, (32 - c)); ++ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) ++{ ++ //serial solution may be faster ++ uint64x2_t b_shift; ++ uint64x2_t a_c; ++ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift ++ a_c = _mm_srli_epi64(a, (64 - c)); ++ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a ++ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) ++} ++ ++uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++#define vsriq_n_u8 vsriq_n_s8 ++ ++uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++#define vsriq_n_u16 vsriq_n_s16 ++ ++uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 ++#define vsriq_n_u32 vsriq_n_s32 ++ ++uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 ++#define vsriq_n_u64 vsriq_n_s64 ++ ++poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 ++#define vsriq_n_p8 vsriq_n_u8 ++ ++poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 ++#define vsriq_n_p16 vsriq_n_u16 ++ ++//***** Vector shift left and insert ********************************************* ++//********************************************************************************* ++//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. ++//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 
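The per-lane behaviour described in the two comment blocks above boils down to a few lines of scalar C. The helpers below (sri_lane_u8, sli_lane_u8) are an illustrative sketch for 8-bit lanes only; they are not part of this header.

#include <stdint.h>

/* VSRI-style insert: keep the top c bits of a, fill the rest with b shifted right by c. */
static inline uint8_t sri_lane_u8(uint8_t a, uint8_t b, int c)   /* 1 <= c <= 8 */
{
    uint8_t keep_a = (uint8_t)(0xFFu << (8 - c));   /* c ones at the top */
    return (uint8_t)((a & keep_a) | (b >> c));
}

/* VSLI-style insert: keep the low c bits of a, fill the rest with b shifted left by c. */
static inline uint8_t sli_lane_u8(uint8_t a, uint8_t b, int c)   /* 0 <= c <= 7 */
{
    uint8_t keep_a = (uint8_t)((1u << c) - 1u);     /* c ones at the bottom */
    return (uint8_t)((a & keep_a) | (uint8_t)(b << c));
}

For example, sri_lane_u8(0xF0, 0xAB, 4) keeps the top four bits of 0xF0 and fills the rest with 0xAB >> 4, giving 0xFA. The SIMD implementations below do the same thing for all lanes at once: a mask built from the maskLeft/maskRight tables selects the preserved bits of "a", and the shifted "b" supplies the rest.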
++int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c) ++{ ++ int8x8_t res64; ++ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c) ++{ ++ int16x4_t res64; ++ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c)); ++} ++ ++ ++int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c) ++{ ++ int32x2_t res64; ++ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c)); ++} ++ ++int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros ++ return res; ++} ++ ++ ++uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++#define vsli_n_u8 vsli_n_s8 ++ ++uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++#define vsli_n_u16 vsli_n_s16 ++ ++uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 ++#define vsli_n_u32 vsli_n_s32 ++ ++uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 ++#define vsli_n_u64 vsli_n_s64 ++ ++poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 ++#define vsli_n_p8 vsli_n_u8 ++ ++poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 ++#define vsli_n_p16 vsli_n_u16 ++ ++int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 ++{ ++ __m128i maskA, a_masked; ++ int8x16_t b_shift; ++ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask ++ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones ++ b_shift = vshlq_n_s8( b, c); ++ a_masked = _mm_and_si128 (a, maskA); ++ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) ++} ++ ++int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 ++{ ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a ++ int16x8_t b_shift; ++ int16x8_t a_c; ++ b_shift = vshlq_n_s16( b, c); ++ a_c = vshlq_n_s16( a, (16 - c)); ++ a_c = _mm_srli_epi16(a_c, (16 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a ++ int32x4_t b_shift; ++ int32x4_t a_c; ++ b_shift = vshlq_n_s32( b, c); 
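// The next two shifts keep only the low c bits of each lane of a: shifting left by (32 - c) and then logically right by (32 - c) clears the upper (32 - c) bits, which is the same as masking each lane with ((1u << c) - 1).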
++ a_c = vshlq_n_s32( a, (32 - c)); ++ a_c = _mm_srli_epi32(a_c, (32 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 ++{ ++ //solution may be not optimal compared with the serial one ++ //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a ++ int64x2_t b_shift; ++ int64x2_t a_c; ++ b_shift = vshlq_n_s64( b, c); ++ a_c = vshlq_n_s64( a, (64 - c)); ++ a_c = _mm_srli_epi64(a_c, (64 - c)); ++ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) ++} ++ ++uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++#define vsliq_n_u8 vsliq_n_s8 ++ ++uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++#define vsliq_n_u16 vsliq_n_s16 ++ ++uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 ++#define vsliq_n_u32 vsliq_n_s32 ++ ++uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 ++#define vsliq_n_u64 vsliq_n_s64 ++ ++poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 ++#define vsliq_n_p8 vsliq_n_u8 ++ ++poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 ++#define vsliq_n_p16 vsliq_n_u16 ++ ++// *********************************************************************************************** ++// ****************** Loads and stores of a single vector *************************************** ++// *********************************************************************************************** ++//Performs loads and stores of a single vector of some type. ++//******************************* Loads ******************************************************** ++// *********************************************************************************************** ++//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. ++//also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. ++// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access ++//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; ++#define LOAD_SI128(ptr) \ ++ ( ((unsigned long)(ptr) & 15) == 0 ) ? 
_mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)); ++ ++uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_u8 LOAD_SI128 ++ ++uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_u16 LOAD_SI128 ++ ++uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++#define vld1q_u32 LOAD_SI128 ++ ++uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld1q_u64 LOAD_SI128 ++ ++int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_s8 LOAD_SI128 ++ ++int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_s16 LOAD_SI128 ++ ++int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++#define vld1q_s32 LOAD_SI128 ++ ++int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld1q_s64 LOAD_SI128 ++ ++float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers ++/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] ++{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); ++__m128 f2; ++f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); ++}*/ ++ ++float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) ++{ ++ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned ++ return _mm_load_ps(ptr); ++ else ++ return _mm_loadu_ps(ptr); ++} ++ ++poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] ++#define vld1q_p8 LOAD_SI128 ++ ++poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] ++#define vld1q_p16 LOAD_SI128 ++ ++uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr)) ++ ++uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_u16 vld1_u8 ++ ++uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] ++#define vld1_u32 vld1_u8 ++ ++ ++uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_u64 vld1_u8 ++ ++int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_s8 vld1_u8 ++ ++int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_s16 vld1_u16 ++ ++int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] ++#define vld1_s32 vld1_u32 ++ ++int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_s64 vld1_u64 ++ ++float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); ++ ++float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] ++_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = *(ptr); ++ res.m64_f32[1] = *(ptr + 1); ++ return res; ++} ++ ++poly8x8_t vld1_p8(__transfersize(8) 
poly8_t const * ptr); // VLD1.8 {d0}, [r0] ++#define vld1_p8 vld1_u8 ++ ++poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] ++#define vld1_p16 vld1_u16 ++ ++//*********************************************************************************************************** ++//******* Lane load functions - insert the data at vector's given position (lane) ************************* ++//*********************************************************************************************************** ++uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) ++ ++uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; ++ ++ ++int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) ++ ++float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) ++{ ++ //we need to deal with ptr 16bit NOT aligned case ++ __m128 p; ++ p = _mm_set1_ps(*(ptr)); ++ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); ++} ++ ++int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] ++#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) ++ ++poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) ++ ++poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) ++ ++uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int 
lane) ++{ ++ uint8x8_t res; ++ res = vec; ++ res.m64_u8[lane] = *(ptr); ++ return res; ++} ++ ++uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16x4_t res; ++ res = vec; ++ res.m64_u16[lane] = *(ptr); ++ return res; ++} ++ ++uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32x2_t res; ++ res = vec; ++ res.m64_u32[lane] = *(ptr); ++ return res; ++} ++ ++uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = *(ptr); ++ return res; ++} ++ ++ ++int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane) ++ ++int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane) ++ ++int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane) ++ ++float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32x2_t res; ++ res = vec; ++ res.m64_f32[lane] = *(ptr); ++ return res; ++} ++ ++int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] ++#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane) ++ ++poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] ++#define vld1_lane_p8 vld1_lane_u8 ++ ++poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] ++#define vld1_lane_p16 vld1_lane_s16 ++ ++// ****************** Load single value ( set all lanes of vector with same value from memory)********************** ++// ****************************************************************************************************************** ++uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr)) ++ ++uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr)) ++ ++uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_u32(ptr) 
_mm_set1_epi32(*(ptr)) ++ ++uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)}; ++ return LOAD_SI128(val); ++} ++ ++int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr)) ++ ++int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr)) ++ ++int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr)) ++ ++int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr) ++ ++float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++//current IA SIMD doesn't support float16, need to go to 32 bits ++ ++float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr)) ++ ++poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr)) ++ ++poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr)) ++ ++uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint8x8_t res; ++ int i; ++ for(i = 0; i<8; i++) { ++ res.m64_u8[i] = *(ptr); ++ } ++ return res; ++} ++ ++uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint16x4_t res; ++ int i; ++ for(i = 0; i<4; i++) { ++ res.m64_u16[i] = *(ptr); ++ } ++ return res; ++} ++ ++uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = *(ptr); ++ res.m64_u32[1] = *(ptr); ++ return res; ++} ++ ++uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] ++_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = *(ptr); ++ return res; ++} ++ ++int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr) ++ ++ ++int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr) ++ ++ ++int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] ++#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr) ++ ++ ++int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] ++#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr) ++ ++float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // 
VLD1.32 {d0[]}, [r0] ++_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = *(ptr); ++ res.m64_f32[1] = res.m64_f32[0]; ++ return res; // use last 64bits only ++} ++ ++poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] ++#define vld1_dup_p8 vld1_dup_u8 ++ ++ ++poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] ++#define vld1_dup_p16 vld1_dup_u16 ++ ++ ++//************************************************************************************* ++//********************************* Store ********************************************** ++//************************************************************************************* ++// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val); ++//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro ++#define STORE_SI128(ptr, val) \ ++ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); ++ ++void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] ++#define vst1q_u8 STORE_SI128 ++ ++void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] ++#define vst1q_u16 STORE_SI128 ++ ++void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] ++#define vst1q_u32 STORE_SI128 ++ ++void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] ++#define vst1q_u64 STORE_SI128 ++ ++void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] ++#define vst1q_s8 STORE_SI128 ++ ++void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] ++#define vst1q_s16 STORE_SI128 ++ ++void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] ++#define vst1q_s32 STORE_SI128 ++ ++void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] ++#define vst1q_s64 STORE_SI128 ++ ++void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) ++{ ++ if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned ++ _mm_store_ps (ptr, val); ++ else ++ _mm_storeu_ps (ptr, val); ++} ++ ++void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] ++#define vst1q_p8 vst1q_u8 ++ ++void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] ++#define vst1q_p16 vst1q_u16 ++ ++void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val) ++{ ++ int i; ++ for (i = 0; i<8; i++) { ++ *(ptr + i) = ((uint8_t*)&val)[i]; ++ } ++ //_mm_storel_epi64((__m128i*)ptr, val); ++ return; ++} ++ ++void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val) ++{ ++ int i; ++ for (i = 0; i<4; i++) { ++ *(ptr + i) = ((uint16_t*)&val)[i]; ++ } ++ //_mm_storel_epi64((__m128i*)ptr,
val); ++ return; ++} ++ ++void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val) ++{ ++ int i; ++ for (i = 0; i<2; i++) { ++ *(ptr + i) = ((uint32_t*)&val)[i]; ++ } ++ //_mm_storel_epi64((__m128i*)ptr, val); ++ return; ++} ++ ++void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val) ++{ ++ *(ptr) = *((uint64_t*)&val); ++ //_mm_storel_epi64((__m128i*)ptr, val); ++ return; ++} ++ ++void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] ++#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val) ++ ++void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] ++#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val) ++ ++void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] ++#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val) ++ ++void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] ++#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val) ++ ++void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val) ++{ ++ *(ptr) = val.m64_f32[0]; ++ *(ptr + 1) = val.m64_f32[1]; ++ return; ++} ++ ++void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] ++#define vst1_p8 vst1_u8 ++ ++void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] ++#define vst1_p16 vst1_u16 ++ ++//***********Store a lane of a vector into memory (extract given lane) ********************* ++//****************************************************************************************** ++void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) ++ ++void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) ++ ++void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) ++ ++void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] ++#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) ++ ++void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) ++ ++void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) ++ ++void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) ++ ++void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); 
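The vst1q_lane_* wrappers in this section map a NEON single-lane store onto an SSE extract followed by a scalar write (via the header's _MM_EXTRACT_* helpers). A minimal stand-alone sketch of the same pattern, not part of the patched header; the helper name is invented and SSE4.1 is assumed:

    #include <smmintrin.h>
    #include <stdint.h>

    /* roughly what vst1q_lane_u32(dst, v, 2) boils down to */
    static void store_lane2_u32(uint32_t * dst, __m128i v)
    {
        *dst = (uint32_t)_mm_extract_epi32(v, 2);
    }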
// VST1.64 {d0}, [r0] ++#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) ++ ++void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) ++{ ++ int32_t ilane; ++ ilane = _MM_EXTRACT_PS(val,lane); ++ *(ptr) = *((float*)&ilane); ++} ++ ++void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1q_lane_p8 vst1q_lane_u8 ++ ++void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1q_lane_p16 vst1q_lane_s16 ++ ++void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val.m64_u8[lane]; ++} ++ ++void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val.m64_u16[lane]; ++} ++ ++void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val.m64_u32[lane]; ++} ++ ++void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] ++_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane) ++{ ++ *(ptr) = val.m64_u64[0]; ++} ++ ++void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane) ++ ++void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane) ++ ++void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane) ++ ++ ++void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] ++#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane) ++ ++ ++void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] ++_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val.m64_f32[lane]; ++} ++ ++void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] ++#define vst1_lane_p8 vst1_lane_u8 ++ ++void vst1_lane_p16(__transfersize(1) 
poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] ++#define vst1_lane_p16 vst1_lane_s16 ++ ++//*********************************************************************************************** ++//**************** Loads and stores of an N-element structure ********************************** ++//*********************************************************************************************** ++//These intrinsics load or store an n-element structure. The array structures are defined in the beginning ++//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" ++//****************** 2 elements load ********************************************* ++uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] ++{ ++ uint8x16x2_t v; ++ v.val[0] = vld1q_u8(ptr); ++ v.val[1] = vld1q_u8((ptr + 16)); ++ v = vuzpq_s8(v.val[0], v.val[1]); ++ return v; ++} ++ ++uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] ++{ ++ uint16x8x2_t v; ++ v.val[0] = vld1q_u16( ptr); ++ v.val[1] = vld1q_u16( (ptr + 8)); ++ v = vuzpq_s16(v.val[0], v.val[1]); ++ return v; ++} ++ ++uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] ++{ ++ uint32x4x2_t v; ++ v.val[0] = vld1q_u32 ( ptr); ++ v.val[1] = vld1q_u32 ( (ptr + 4)); ++ v = vuzpq_s32(v.val[0], v.val[1]); ++ return v; ++} ++ ++int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); ++#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) ++ ++int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) ++ ++int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) ++ ++ ++float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
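In the 2-register de-interleaving loads above, the actual even/odd split is done by the vuzpq_* step. One way the same split can be expressed for 32-bit elements in plain SSE2 (a sketch under invented names, not the header's own code):

    #include <emmintrin.h>

    /* a = x0 x1 x2 x3, b = x4 x5 x6 x7  ->  even = x0 x2 x4 x6, odd = x1 x3 x5 x7 */
    static void deinterleave2_u32(__m128i a, __m128i b, __m128i * even, __m128i * odd)
    {
        __m128i ta = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); /* x0 x2 x1 x3 */
        __m128i tb = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 2, 0)); /* x4 x6 x5 x7 */
        *even = _mm_unpacklo_epi64(ta, tb);
        *odd  = _mm_unpackhi_epi64(ta, tb);
    }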
See vld1q_f16 for example ++ ++float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] ++{ ++ float32x4x2_t v; ++ v.val[0] = vld1q_f32 (ptr); ++ v.val[1] = vld1q_f32 ((ptr + 4)); ++ v = vuzpq_f32(v.val[0], v.val[1]); ++ return v; ++} ++ ++poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] ++#define vld2q_p8 vld2q_u8 ++ ++poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] ++#define vld2q_p16 vld2q_u16 ++ ++uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) ++{ ++ uint8x8x2_t v; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; ++ __m128i ld128; ++ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd); ++ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); ++ return v; ++} ++ ++uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint16x4x2_t v; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ __m128i ld128; ++ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd); ++ vst1q_u16((v.val), ld128); ++ return v; ++} ++ ++uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) ++{ ++ _NEON2SSE_ALIGN_16 uint32x2x2_t v; ++ __m128i ld128; ++ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit ++ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); ++ vst1q_u32((v.val), ld128); ++ return v; ++} ++ ++uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) ++{ ++ uint64x1x2_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ return v; ++} ++ ++int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) ++ ++int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) ++ ++int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) ++ ++int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) ++ ++float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
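The d-register variants above (vld2_u8 and friends) do the whole de-interleave with a single SSSE3 pshufb and an even/odd byte mask. A self-contained sketch of that trick, with my own naming and SSSE3 assumed:

    #include <tmmintrin.h>

    /* even-indexed bytes end up in the low 8 bytes, odd-indexed bytes in the high 8 bytes */
    static __m128i split_even_odd_bytes(__m128i interleaved)
    {
        const __m128i even_odd = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                               1, 3, 5, 7, 9, 11, 13, 15);
        return _mm_shuffle_epi8(interleaved, even_odd);
    }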
See vld1_f16 for example ++ ++float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) ++{ ++ float32x2x2_t v; ++ v.val[0].m64_f32[0] = *(ptr); ++ v.val[0].m64_f32[1] = *(ptr + 2); ++ v.val[1].m64_f32[0] = *(ptr + 1); ++ v.val[1].m64_f32[1] = *(ptr + 3); ++ return v; ++} ++ ++poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] ++#define vld2_p8 vld2_u8 ++ ++poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] ++#define vld2_p16 vld2_u16 ++ ++//******************** Triplets *************************************** ++//********************************************************************* ++uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> ++ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 ++ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, ++ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 ++ uint8x16x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; ++ ++ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 ++ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 ++ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 ++ ++ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 ++ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 ++ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 ++ ++ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 ++ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x ++ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, ++ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 ++ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, ++ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, ++ ++ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, ++ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 ++ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 ++ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, ++ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, ++ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 ++ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 ++ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, ++ ++ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, ++ v.val[2] = _mm_srli_si128(tmp1,11); 
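    /* Note on the pattern used throughout the vld3q_* bodies below: shifting a whole
       register by the same byte count with _mm_slli_si128 and then _mm_srli_si128
       (or the reverse order) simply zeroes the bytes outside the fragment being kept,
       so the fragments gathered from the three shuffled sources can be merged with
       _mm_or_si128. */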
//b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 ++ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, ++ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, ++ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, ++ return v; ++} ++ ++uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] ++{ ++ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 ++ uint16x8x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; ++ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; ++ ++ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, ++ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 ++ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 ++ ++ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, ++ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 ++ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 ++ ++ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, ++ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x ++ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 ++ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 ++ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, ++ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5 ++ ++ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7 ++ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 ++ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 ++ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6, ++ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5, ++ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0, ++ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, ++ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, ++ ++ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 ++ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, ++ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 ++ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7, ++ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 ++ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7, ++ return v; ++} ++ ++uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ uint32x4x3_t v; ++ __m128i tmp0, tmp1,tmp2, tmp3; ++ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, ++ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 ++ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, ++ ++ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); 
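    /* The shuffle immediates written as 0 | (3 << 2) | (1 << 4) | (2 << 6) pack one
       2-bit source-lane index per destination lane, lowest bits first: this one
       selects source lanes 0,3,1,2 (hence the a0,a3,a1,a2 comment) and is the same
       constant as _MM_SHUFFLE(2,1,3,0). */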
//a0,a3,a1,a2 ++ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 ++ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 ++ ++ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 ++ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 ++ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 ++ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, ++ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 ++ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 ++ return v; ++} ++ ++int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) ++ ++int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) ++ ++int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) ++ ++float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ float32x4x3_t v; ++ __m128 tmp0, tmp1,tmp2, tmp3; ++ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, ++ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 ++ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, ++ ++ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 ++ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 ++ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 ++ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 ++ ++ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 ++ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 ++ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, ++ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 ++ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 ++ return v; ++} ++ ++poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] ++#define vld3q_p8 vld3q_u8 ++ ++poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] ++#define vld3q_p16 vld3q_u16 ++ ++uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0] ++{ ++ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 ++ uint8x8x3_t v; ++ __m128i val0, val1, val2, tmp0, tmp1; ++ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; ++ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; ++ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 ++ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7 ++ ++ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); 
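    /* mask8_0 gathers every third byte of the first 16 input bytes: positions
       0,3,6,9,12,15 first, then 1,4,7,10,13, then 2,5,8,11,14, so the pieces that
       will end up in val[0], val[1] and val[2] become contiguous and can be
       combined with the fragments of the remaining 8 bytes using shifts and ORs. */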
//a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, ++ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x ++ val0 = _mm_slli_si128(tmp0,10); ++ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 ++ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x ++ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x ++ _M64(v.val[0], val0); ++ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, ++ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, ++ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 ++ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 ++ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x ++ _M64(v.val[1], val1); ++ ++ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, ++ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 ++ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7, ++ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, ++ uint16x4x3_t v; ++ __m128i val0, val1, val2, tmp0, tmp1; ++ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; ++ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 ++ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x ++ ++ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 ++ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3, ++ val0 = _mm_slli_si128(tmp0,10); ++ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0, ++ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1 ++ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0 ++ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x ++ _M64(v.val[0], val0); ++ ++ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 ++ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0, ++ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0, ++ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0 ++ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x ++ _M64(v.val[1], val1); ++ ++ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 ++ tmp1 = _mm_srli_si128(tmp1,4); ++ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, ++ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] ++{ ++ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 ++ uint32x2x3_t v; ++ __m128i val0, val1, val2; ++ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, ++ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x ++ ++ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 ++ _M64(v.val[0], val0); ++ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1, ++ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1 ++ _M64(v.val[1], val1); ++ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x, ++ _M64(v.val[2], val2); ++ return v; ++} ++uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // 
VLD1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] ++{ ++ uint64x1x3_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ return v; ++} ++ ++int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) ++ ++int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) ++ ++int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) ++ ++int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) ++ ++float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) ++{ ++ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 ++ float32x2x3_t v; ++ v.val[0].m64_f32[0] = *(ptr); ++ v.val[0].m64_f32[1] = *(ptr + 3); ++ ++ v.val[1].m64_f32[0] = *(ptr + 1); ++ v.val[1].m64_f32[1] = *(ptr + 4); ++ ++ v.val[2].m64_f32[0] = *(ptr + 2); ++ v.val[2].m64_f32[1] = *(ptr + 5); ++ return v; ++} ++ ++poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] ++#define vld3_p8 vld3_u8 ++ ++poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] ++#define vld3_p16 vld3_u16 ++ ++//*************** Quadruples load ******************************** ++//***************************************************************** ++uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] ++{ ++ uint8x16x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ ++ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 ++ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15 ++ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15 ++ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 ++ ++ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 ++ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 ++ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 ++ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 ++ ++ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 ++ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 ++ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 ++ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 ++ ++ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 ++ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 ++ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, ++ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 ++ ++ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 ++ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 ++ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 ++ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 ++ return v; ++} ++ ++uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] ++{ ++ uint16x8x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 ++ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 ++ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 ++ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 ++ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, ++ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, ++ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7 ++ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7 ++ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5 ++ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 ++ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 ++ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7 ++ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, ++ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 ++ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, ++ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 ++ return v; ++} ++ ++uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] ++{ ++ uint32x4x4_t v; ++ __m128i tmp3, tmp2, tmp1, tmp0; ++ v.val[0] = vld1q_u32 (ptr); ++ v.val[1] = vld1q_u32 ((ptr + 4)); ++ v.val[2] = vld1q_u32 ((ptr + 8)); ++ v.val[3] = vld1q_u32 ((ptr + 12)); ++ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); ++ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]); ++ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); ++ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); ++ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); ++ v.val[1] = 
_mm_unpackhi_epi64(tmp0, tmp1); ++ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); ++ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); ++ return v; ++} ++ ++int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) ++ ++int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) ++ ++int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) ++ ++float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] ++{ ++ float32x4x4_t v; ++ __m128 tmp3, tmp2, tmp1, tmp0; ++ ++ v.val[0] = vld1q_f32 ((float*) ptr); ++ v.val[1] = vld1q_f32 ((float*) (ptr + 4)); ++ v.val[2] = vld1q_f32 ((float*) (ptr + 8)); ++ v.val[3] = vld1q_f32 ((float*) (ptr + 12)); ++ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); ++ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); ++ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); ++ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); ++ v.val[0] = _mm_movelh_ps(tmp0, tmp2); ++ v.val[1] = _mm_movehl_ps(tmp2, tmp0); ++ v.val[2] = _mm_movelh_ps(tmp1, tmp3); ++ v.val[3] = _mm_movehl_ps(tmp3, tmp1); ++ return v; ++} ++ ++poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] ++#define vld4q_p8 vld4q_u8 ++ ++poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] ++#define vld4q_p16 vld4q_s16 ++ ++uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] ++{ ++ uint8x8x4_t v; ++ __m128i sh0, sh1; ++ __m128i val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; ++ ++ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] ++ val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] ++ ++ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8); ++ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 ++ vst1q_u8(&v.val[0], val0 ); ++ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 ++ vst1q_u8(&v.val[2], val2 ); ++ return v; ++} ++ ++uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] ++{ ++ uint16x4x4_t v; ++ __m128i sh0, sh1; ++ __m128i val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 ++ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] ++ val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] ++ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16); ++ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 
1,5,9,13 ++ vst1q_u16(&v.val[0], val0 ); ++ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 ++ vst1q_u16(&v.val[2], val2 ); ++ return v; ++} ++ ++uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) ++{ ++ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 ++ uint32x2x4_t v; ++ __m128i val0, val01, val2; ++ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, ++ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 ++ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1, ++ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1 ++ vst1q_u32(&v.val[0], val01); ++ vst1q_u32(&v.val[2], val2 ); ++ return v; ++} ++ ++uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] ++{ ++ uint64x1x4_t v; ++ v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1] ++ v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1] ++ v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3] ++ v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3] ++ return v; ++} ++ ++int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) ++ ++int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) ++ ++int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) ++ ++int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) ++ ++float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
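For reference, the result the whole vld4_* d-register family is emulating is a plain 4-way de-interleave of two consecutive 4-element structures. A scalar sketch of the intended semantics (illustration only, names are mine):

    #include <stdint.h>

    /* out[k][i] receives member k of the i-th structure, i.e. ptr[4*i + k] */
    static void vld4_u32_reference(const uint32_t * ptr, uint32_t out[4][2])
    {
        int i, k;
        for (i = 0; i < 2; i++)
            for (k = 0; k < 4; k++)
                out[k][i] = ptr[4 * i + k];
    }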
See vld1q_f16 for example ++ ++float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] ++{ ++ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 ++ float32x2x4_t res; ++ res.val[0].m64_f32[0] = *(ptr); ++ res.val[0].m64_f32[1] = *(ptr + 4); ++ res.val[1].m64_f32[0] = *(ptr + 1); ++ res.val[1].m64_f32[1] = *(ptr + 5); ++ res.val[2].m64_f32[0] = *(ptr + 2); ++ res.val[2].m64_f32[1] = *(ptr + 6); ++ res.val[3].m64_f32[0] = *(ptr + 3); ++ res.val[3].m64_f32[1] = *(ptr + 7); ++ return res; ++} ++ ++poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] ++#define vld4_p8 vld4_u8 ++ ++poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] ++#define vld4_p16 vld4_u16 ++ ++//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* ++//******************************************************************************************************************* ++uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] ++{ ++ uint8x8x2_t v; ++ __m128i val0, val1; ++ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ vst1q_u8(v.val, val0); ++ return v; ++} ++ ++uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] ++{ ++ uint16x4x2_t v; ++ __m128i val0, val1; ++ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0) ++ _M64(v.val[0], val0); ++ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1) ++ _M64(v.val[1], val1); ++ return v; ++} ++ ++uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] ++{ ++ uint32x2x2_t v; ++ __m128i val0; ++ val0 = LOAD_SI128(ptr); //0,1,x,x ++ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 ++ vst1q_u32(v.val, val0); ++ return v; ++} ++ ++uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_dup_u64 vld2_u64 ++ ++int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) ++ ++int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) ++ ++int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr) ++ ++int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] ++#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) ++ ++float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++// IA32 SIMD doesn't work with 
16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] ++{ ++ float32x2x2_t v; ++ v.val[0].m64_f32[0] = *(ptr); //0,0 ++ v.val[0].m64_f32[1] = *(ptr); //0,0 ++ v.val[1].m64_f32[0] = *(ptr + 1); //1,1 ++ v.val[1].m64_f32[1] = *(ptr + 1); //1,1 ++ return v; ++} ++ ++poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] ++#define vld2_dup_p8 vld2_dup_u8 ++ ++poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] ++#define vld2_dup_p16 vld2_dup_s16 ++ ++//************* Duplicate (or propagate)triplets: ******************* ++//******************************************************************** ++//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes ++uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] ++{ ++ uint8x8x3_t v; ++ __m128i val0, val1, val2; ++ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, ++ vst1q_u8(v.val, val0); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] ++{ ++ uint16x4x3_t v; ++ __m128i val0, val1, val2; ++ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0) ++ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1) ++ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2) ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] ++{ ++ uint32x2x3_t v; ++ __m128i val0, val1, val2; ++ val2 = LOAD_SI128(ptr); //0,1,2,x ++ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 ++ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 ++ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0 ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ return v; ++} ++ ++uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] ++{ ++ uint64x1x3_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ return v; ++} ++ ++int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) ++ ++int16x4x3_t vld3_dup_s16(__transfersize(3) 
int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) ++ ++int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) ++ ++int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] ++#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) ++ ++ ++float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] ++_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] ++{ ++ float32x2x3_t v; ++ int i; ++ for (i = 0; i<3; i++) { ++ v.val[i].m64_f32[0] = *(ptr + i); ++ v.val[i].m64_f32[1] = *(ptr + i); ++ } ++ return v; ++} ++ ++poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_p8 vld3_dup_u8 ++ ++poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] ++#define vld3_dup_p16 vld3_dup_s16 ++ ++ ++//************* Duplicate (or propagate) quadruples: ******************* ++//*********************************************************************** ++//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes ++uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint8x8x4_t v; ++ __m128i val0, val1, val2; ++ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x ++ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, ++ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 ++ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, ++ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 ++ vst1q_u8(&v.val[0], val0); ++ vst1q_u8(&v.val[2], val2); ++ return v; ++} ++ ++uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint16x4x4_t v; ++ __m128i val0, val1, val2, val3; ++ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x ++ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0) ++ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1) ++ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2) ++ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3) ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ _M64(v.val[3], val3); ++ return v; ++} ++ ++uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ uint32x2x4_t v; ++ __m128i val0, val1, val2, val3; ++ val3 = LOAD_SI128(ptr); //0,1,2,3 ++ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 ++ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 
4) | (3 << 6)); //1,1,2,3 ++ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 ++ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 ++ _M64(v.val[0], val0); ++ _M64(v.val[1], val1); ++ _M64(v.val[2], val2); ++ _M64(v.val[3], val3); ++ return v; ++} ++ ++uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] ++{ ++ uint64x1x4_t v; ++ v.val[0].m64_u64[0] = *(ptr); ++ v.val[1].m64_u64[0] = *(ptr + 1); ++ v.val[2].m64_u64[0] = *(ptr + 2); ++ v.val[3].m64_u64[0] = *(ptr + 3); ++ return v; ++} ++ ++int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) ++ ++int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) ++ ++int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) ++ ++int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] ++#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) ++ ++float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] ++{ ++ float32x2x4_t v; ++ int i; ++ for (i = 0; i<4; i++) { ++ v.val[i].m64_f32[0] = *(ptr + i); ++ v.val[i].m64_f32[1] = *(ptr + i); ++ } ++ return v; ++} ++ ++poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_p8 vld4_dup_u8 ++ ++poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] ++#define vld4_dup_p16 vld4_dup_u16 ++ ++ ++//********************************************************************************** ++//*******************Lane loads for an N-element structures *********************** ++//********************************************************************************** ++//********************** Lane pairs ************************************************ ++//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon ++//we assume src is 16 bit aligned ++ ++//!!!!!! 
Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error ++//to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined ++ ++//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0] ++{ ++ uint16x8x2_t v; ++ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] ++{ ++ uint32x4x2_t v; ++ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); ++ return v; ++} ++#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane) ++ ++//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane) ++{ ++ int16x8x2_t v; ++ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane) ++ ++//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) ++{ ++ int32x4x2_t v; ++ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); ++ return v; ++} ++#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) ++ ++//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] ++{ ++ float32x4x2_t v; ++ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane) ++ ++//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] ++#define vld2q_lane_p16 vld2q_lane_u16 ++ ++//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint8x8x2_t 
vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] ++{ ++ uint8x8x2_t v; ++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane) ++ ++//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane) ++{ ++ uint16x4x2_t v; ++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane) ++{ ++ uint32x2x2_t v; ++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane) ++ ++//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) ++ ++//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] ++int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) ++ ++//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] ++int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane) ++ ++//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane) ++{ ++ float32x2x2_t v; ++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); ++ return v; ++} ++#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane) ++ ++//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] ++poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] ++#define 
vld2_lane_p8 vld2_lane_u8 ++ ++//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] ++poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] ++#define vld2_lane_p16 vld2_lane_u16 ++ ++//*********** Lane triplets ********************** ++//************************************************* ++//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon ++//we assume src is 16 bit aligned ++ ++//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++{ ++ uint16x8x3_t v; ++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); ++ return v; ++} ++#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++{ ++ uint32x4x3_t v; ++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); ++ return v; ++} ++#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane) ++ ++//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++{ ++ int16x8x3_t v; ++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); ++ return v; ++} ++#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane) ++ ++//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++{ ++ int32x4x3_t v; ++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); ++ return v; ++} ++#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane) ++ ++float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++//current IA SIMD doesn't support float16 ++#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane) ++ ++ ++//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) 
int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] ++{ ++ float32x4x3_t v; ++ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); ++ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); ++ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); ++ return v; ++} ++#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane) ++ ++poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] ++#define vld3q_lane_p16 vld3q_lane_u16 ++ ++//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++{ ++ uint8x8x3_t v; ++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); ++ return v; ++} ++#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane) ++ ++//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++{ ++ uint16x4x3_t v; ++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); ++ return v; ++} ++#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++{ ++ //need to merge into 128 bit anyway ++ uint32x2x3_t v; ++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);; ++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);; ++ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);; ++ return v; ++} ++#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane) ++ ++int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane) ++ ++int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane) ++ ++int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane) ++ ++float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++//current IA SIMD doesn't support float16 
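++
++// Illustrative usage sketch (not from the original NEON_2_SSE.h): the *_ptr wrappers
++// above exist only to work around the MSVC "__declspec(align(16)) formal parameter"
++// limitation; callers keep using the NEON-style macro with a compile-time-constant lane.
++// The helper name below is hypothetical and the block is compiled out (#if 0), so it
++// documents the calling convention without affecting the header.
++#if 0 /* documentation-only example */
++static uint16x4x3_t neon2sse_example_vld3_lane_u16(const uint16_t * three_vals, uint16x4x3_t acc)
++{
++    /* inserts three_vals[0], [1], [2] into lane 2 of acc.val[0], acc.val[1], acc.val[2] */
++    return vld3_lane_u16(three_vals, acc, 2);
++}
++#endif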
++ ++//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] ++{ ++ float32x2x3_t v; ++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); ++ return v; ++} ++#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane) ++ ++//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] ++#define vld3_lane_p8 vld3_lane_u8 ++ ++//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] ++#define vld3_lane_p16 vld3_lane_u16 ++ ++//******************* Lane Quadruples load *************************** ++//********************************************************************* ++//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon ++//we assume src is 16 bit aligned ++ ++//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane) ++{ ++ uint16x8x4_t v; ++ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); ++ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane); ++ return v; ++} ++#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane) ++{ ++ uint32x4x4_t v; ++ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); ++ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); ++ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); ++ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane); ++ return v; ++} ++#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane) ++ ++//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane) ++ ++//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane) ++ ++//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 
const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) ++{ ++ float32x4x4_t v; ++ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); ++ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); ++ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); ++ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); ++ return v; ++} ++#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane) ++ ++//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++#define vld4q_lane_p16 vld4q_lane_u16 ++ ++//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane) ++{ ++ uint8x8x4_t v; ++ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane) ++ ++//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane) ++{ ++ uint16x4x4_t v; ++ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane) ++ ++//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane) ++{ ++ uint32x2x4_t v; ++ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane) ++ ++//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); ++#define 
vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane) ++ ++//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); ++#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane) ++ ++//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); ++#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane) ++ ++//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane) ++{ ++ //serial solution may be faster ++ float32x2x4_t v; ++ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); ++ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); ++ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); ++ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane); ++ return v; ++} ++#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane) ++ ++//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); ++#define vld4_lane_p8 vld4_lane_u8 ++ ++//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); ++#define vld4_lane_p16 vld4_lane_u16 ++ ++//******************* Store duplets ********************************************* ++//******************************************************************************** ++//here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function ++//If necessary you need to modify all store functions accordingly. 
See more comments to "Store single" functions ++//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) ++{ ++ uint8x16x2_t v; ++ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); ++ vst1q_u8 (ptr, v.val[0]); ++ vst1q_u8 ((ptr + 16), v.val[1]); ++} ++#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) ++ ++//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) ++{ ++ uint16x8x2_t v; ++ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); ++ vst1q_u16 (ptr, v.val[0]); ++ vst1q_u16 ((ptr + 8), v.val[1]); ++} ++#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) ++ ++//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) ++{ ++ uint32x4x2_t v; ++ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); ++ vst1q_u32 (ptr, v.val[0]); ++ vst1q_u32 ((ptr + 4), v.val[1]); ++} ++#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) ++ ++//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] ++void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); ++#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) ++ ++//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); ++#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) ++ ++//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] ++void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); ++#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) ++ ++//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] ++_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) ++{ ++ float32x4x2_t v; ++ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); ++ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); ++ vst1q_f32 (ptr, v.val[0]); ++ vst1q_f32 ((ptr + 4), v.val[1]); ++} ++#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) ++ ++//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] ++void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); ++#define vst2q_p8 vst2q_u8 ++ ++//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0] ++void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); ++#define vst2q_p16 vst2q_u16 ++ ++//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u8 (ptr, v0); ++} ++#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, 
&val) ++ ++//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u16 (ptr, v0); ++} ++#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val) ++ ++//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val) ++{ ++ __m128i v0; ++ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ vst1q_u32 (ptr, v0); ++} ++#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val) ++ ++ ++//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] ++void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); ++_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++} ++#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val) ++ ++//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] ++#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) ++ ++//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] ++#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) ++ ++//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] ++#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) ++ ++//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); ++#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) ++ ++//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] ++//current IA SIMD doesn't support float16 ++ ++//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0] ++_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val) ++{ ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[0].m64_f32[1]; ++ *(ptr + 3) = val->val[1].m64_f32[1]; ++} ++#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val) ++ ++//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] ++#define vst2_p8 vst2_u8 ++ ++//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] ++#define vst2_p16 vst2_u16 ++ ++//******************** Triplets store ***************************************** ++//****************************************************************************** ++//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) ++{ ++ uint8x16x3_t v; ++ __m128i v0,v1,v2, cff, bldmask; ++ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; ++ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t 
mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; ++ ++ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 ++ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 ++ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 ++ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding ++ cff = _mm_cmpeq_epi8(v0, v0); //all ff ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); ++ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); ++ vst1q_u8(ptr, v.val[0]); ++ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); ++ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); ++ vst1q_u8((ptr + 16), v.val[1]); ++ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); ++ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); ++ vst1q_u8((ptr + 32), v.val[2]); ++} ++#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) ++ ++//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) ++{ ++ uint16x8x3_t v; ++ __m128i v0,v1,v2, cff, bldmask; ++ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; ++ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; ++ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; ++ ++ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 ++ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, ++ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 ++ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding ++ cff = _mm_cmpeq_epi16(v0, v0); //all ff ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); ++ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); ++ vst1q_u16(ptr, v.val[0]); ++ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); ++ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); ++ vst1q_u16((ptr + 8), v.val[1]); ++ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] 
data embedding ++ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding ++ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); ++ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); ++ vst1q_u16((ptr + 16), v.val[2]); ++} ++#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) ++ ++//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) ++{ ++ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 ++ uint32x4x3_t v; ++ __m128i tmp0, tmp1,tmp2; ++ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 ++ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 ++ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 ++ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, ++ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 ++ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 ++ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 ++ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, ++ ++ vst1q_u32(ptr, v.val[0]); ++ vst1q_u32((ptr + 4), v.val[1]); ++ vst1q_u32((ptr + 8), v.val[2]); ++} ++#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) ++ ++//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); ++void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); ++#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) ++ ++//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); ++void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); ++#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) ++ ++//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); ++void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); ++#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) ++ ++//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] ++void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] ++_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) ++{ ++ float32x4x3_t v; ++ __m128 tmp0, tmp1,tmp2; ++ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 ++ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 ++ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 ++ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, ++ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 ++ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 ++ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 ++ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, ++ ++ vst1q_f32( ptr, v.val[0]); ++ vst1q_f32( (ptr + 4), v.val[1]); ++ vst1q_f32( (ptr + 8), v.val[2]); ++} ++#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) ++ ++//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] ++void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); ++#define 
vst3q_p8 vst3q_u8 ++ ++//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] ++void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); ++#define vst3q_p16 vst3q_u16 ++ ++//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val) ++{ ++ __m128i tmp, sh0, sh1, val0, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; ++ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; ++ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; ++ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; ++ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) ); ++ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) ++ val2 = _pM128i(val->val[2]); ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); ++ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); ++ vst1q_u8(ptr, val0); //store as 128 bit structure ++ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) ++ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); ++ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); ++ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory ++} ++#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val) ++ ++//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val) ++{ ++ __m128i tmp, val0, val1, val2; ++ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; ++ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; ++ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] ++ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] ++ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1])); ++ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); ++ val2 = _pM128i(val->val[2]); ++ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); ++ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f); ++ vst1q_u16(ptr, val0); //store as 128 bit structure ++ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); ++ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); ++ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order ++ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory ++} ++#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val) ++ ++//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val) ++{ ++ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; ++ __m128i val0, val1; ++ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5 ++ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 ++ val1 = _mm_srli_si128(val0, 8); //4,5, x,x ++ _M64((*(__m64_128*)(ptr + 4)), val1); ++ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2 ++ val0 = 
_mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 ++ vst1q_u32(ptr, val0); //store as 128 bit structure ++} ++#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val) ++ ++//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++ *(ptr + 2) = val->val[2].m64_u64[0]; ++} ++#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val) ++ ++//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0] ++#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val) ++ ++//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0] ++#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val) ++ ++//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] ++#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val) ++ ++//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0] ++#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val) ++ ++//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] ++void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] ++_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val) ++{ ++ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5 ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[2].m64_f32[0]; ++ *(ptr + 3) = val->val[0].m64_f32[1]; ++ *(ptr + 4) = val->val[1].m64_f32[1]; ++ *(ptr + 5) = val->val[2].m64_f32[1]; ++} ++#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val) ++ ++//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] ++void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); ++#define vst3_p8 vst3_u8 ++ ++//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] ++void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); ++#define vst3_p16 vst3_s16 ++ ++//*************** Quadruples store ******************************** ++//********************************************************************* ++//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) ++{ ++ __m128i tmp1, tmp2, res; ++ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 ++ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 ++ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 ++ vst1q_u8(ptr, res); ++ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 ++ vst1q_u8((ptr + 16), res); ++ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // ++ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // ++ res = _mm_unpacklo_epi16(tmp1, tmp2); // ++ vst1q_u8((ptr + 32), 
res); ++ res = _mm_unpackhi_epi16(tmp1, tmp2); // ++ vst1q_u8((ptr + 48), res); ++} ++#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) ++ ++//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) ++{ ++ uint16x8x4_t v; ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); ++ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); ++ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); ++ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); ++ vst1q_u16(ptr, v.val[0]); ++ vst1q_u16((ptr + 8), v.val[1]); ++ vst1q_u16((ptr + 16),v.val[2]); ++ vst1q_u16((ptr + 24), v.val[3]); ++} ++#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val) ++ ++//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) ++{ ++ uint16x8x4_t v; ++ __m128i tmp1, tmp2; ++ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); ++ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); ++ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 ++ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 ++ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); ++ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); ++ vst1q_u32(ptr, v.val[0]); ++ vst1q_u32((ptr + 4), v.val[1]); ++ vst1q_u32((ptr + 8), v.val[2]); ++ vst1q_u32((ptr + 12), v.val[3]); ++} ++#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) ++ ++//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); ++void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); ++#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) ++ ++//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); ++void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); ++#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) ++ ++//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); ++void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); ++#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) ++ ++//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] ++_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) ++{ ++ __m128 tmp3, tmp2, tmp1, tmp0; ++ float32x4x4_t v; ++ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); ++ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); ++ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); ++ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); ++ v.val[0] = _mm_movelh_ps(tmp0, tmp2); ++ v.val[1] = _mm_movehl_ps(tmp2, tmp0); ++ v.val[2] = _mm_movelh_ps(tmp1, tmp3); ++ v.val[3] = _mm_movehl_ps(tmp3, tmp1); ++ vst1q_f32(ptr, 
v.val[0]); ++ vst1q_f32((ptr + 4), v.val[1]); ++ vst1q_f32((ptr + 8), v.val[2]); ++ vst1q_f32((ptr + 12), v.val[3]); ++} ++#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) ++ ++//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] ++void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); ++#define vst4q_p8 vst4q_u8 ++ ++//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] ++void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); ++#define vst4q_p16 vst4q_s16 ++ ++//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val) ++{ ++ __m128i sh0, sh1, val0, val2; ++ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, ++ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 ++ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, ++ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 ++ vst1q_u8(ptr, val0); ++ vst1q_u8((ptr + 16), val2); ++} ++#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val) ++ ++//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val) ++{ ++ __m128i sh0, sh1, val0, val2; ++ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1, ++ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3 ++ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 ++ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 ++ vst1q_u16(ptr, val0); //store as 128 bit structure ++ vst1q_u16((ptr + 8), val2); ++} ++#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val) ++ ++//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val) ++{ ++ //0,4, 1,5, 2,6, 3,7 ++ __m128i sh0, sh1, val0, val1; ++ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5 ++ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7 ++ val0 = _mm_unpacklo_epi64(sh0,sh1); // ++ val1 = _mm_unpackhi_epi64(sh0,sh1); // ++ vst1q_u32(ptr, val0); //store as 128 bit structure ++ vst1q_u32((ptr + 4), val1); ++} ++#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val) ++ ++//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val) ++{ ++ *(ptr) = val->val[0].m64_u64[0]; ++ *(ptr + 1) = val->val[1].m64_u64[0]; ++ *(ptr + 2) = val->val[2].m64_u64[0]; ++ *(ptr + 3) = val->val[3].m64_u64[0]; ++} ++#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val) ++ ++//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] ++#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) ++ ++//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] ++#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) ++ ++//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, 
d3}, [r0] ++#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) ++ ++//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] ++void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); ++#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) ++ ++//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); ++// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example ++ ++//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] ++_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val) ++{ ++ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7 ++ *(ptr) = val->val[0].m64_f32[0]; ++ *(ptr + 1) = val->val[1].m64_f32[0]; ++ *(ptr + 2) = val->val[2].m64_f32[0]; ++ *(ptr + 3) = val->val[3].m64_f32[0]; ++ *(ptr + 4) = val->val[0].m64_f32[1]; ++ *(ptr + 5) = val->val[1].m64_f32[1]; ++ *(ptr + 6) = val->val[2].m64_f32[1]; ++ *(ptr + 7) = val->val[3].m64_f32[1]; ++} ++#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val) ++ ++//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] ++void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); ++#define vst4_p8 vst4_u8 ++ ++//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] ++void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); ++#define vst4_p16 vst4_u16 ++ ++//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* ++//******************************************************************************************************************** ++//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) ++{ ++ vst1q_lane_s16(ptr, val->val[0], lane); ++ vst1q_lane_s16((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_u32(ptr, val->val[0], lane); ++ vst1q_lane_u32((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); ++#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] ++void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); ++#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int 
lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++} ++#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane) ++ ++//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] ++void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); ++#define vst2q_lane_p16 vst2q_lane_s16 ++ ++//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0] ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++} ++#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++} ++#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] ++void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++} ++#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); ++#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) ++ ++//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); ++#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] ++void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); ++#define vst2_lane_s32(ptr, val, lane) 
vst2_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] ++//current IA SIMD doesn't support float16 ++ ++void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] ++_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++} ++#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane) ++ ++//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] ++#define vst2_lane_p8 vst2_lane_u8 ++ ++//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] ++#define vst2_lane_p16 vst2_lane_u16 ++ ++//************************* Triple lanes stores ******************************************************* ++//******************************************************************************************************* ++//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) ++{ ++ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); ++ vst1q_lane_u16((ptr + 2), val->val[2], lane); ++} ++#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) ++{ ++ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); ++ vst1q_lane_u32((ptr + 2), val->val[2], lane); ++} ++#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); ++#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) ++ ++//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); ++#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) ++ ++//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] ++_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++ vst1q_lane_f32((ptr + 2), val->val[2], lane); ++} ++#define 
vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] ++void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); ++#define vst3q_lane_p16 vst3q_lane_s16 ++ ++//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++ *(ptr + 2) = val->val[2].m64_u8[lane]; ++} ++#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++ *(ptr + 2) = val->val[2].m64_u16[lane]; ++} ++#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++ *(ptr + 2) = val->val[2].m64_u32[lane]; ++} ++#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); ++#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) ++ ++//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); ++#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) ++ ++//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); ++#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) ++ ++//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); ++_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++ *(ptr + 2) = 
val->val[2].m64_f32[lane]; ++} ++#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); ++#define vst3_lane_p8 vst3_lane_u8 ++ ++//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] ++void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); ++#define vst3_lane_p16 vst3_lane_s16 ++ ++//******************************** Quadruple lanes stores *********************************************** ++//******************************************************************************************************* ++//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) ++{ ++ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); ++ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); ++} ++#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane) ++{ ++ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); ++ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); ++} ++#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); ++#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) ++ ++//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); ++#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) ++ ++//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); ++//current IA SIMD doesn't support float16 ++ ++//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] ++_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) ++{ ++ vst1q_lane_f32(ptr, val->val[0], lane); ++ vst1q_lane_f32((ptr + 1), val->val[1], lane); ++ vst1q_lane_f32((ptr + 2), val->val[2], lane); ++ vst1q_lane_f32((ptr + 3), val->val[3], lane); ++} ++#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], 
d6[0]}, [r0] ++void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); ++#define vst4q_lane_p16 vst4q_lane_u16 ++ ++//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane) ++{ ++ *(ptr) = val->val[0].m64_u8[lane]; ++ *(ptr + 1) = val->val[1].m64_u8[lane]; ++ *(ptr + 2) = val->val[2].m64_u8[lane]; ++ *(ptr + 3) = val->val[3].m64_u8[lane]; ++} ++#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane) ++ ++//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane) ++{ ++ *(ptr) = val->val[0].m64_u16[lane]; ++ *(ptr + 1) = val->val[1].m64_u16[lane]; ++ *(ptr + 2) = val->val[2].m64_u16[lane]; ++ *(ptr + 3) = val->val[3].m64_u16[lane]; ++} ++#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane) ++ ++//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_u32[lane]; ++ *(ptr + 1) = val->val[1].m64_u32[lane]; ++ *(ptr + 2) = val->val[2].m64_u32[lane]; ++ *(ptr + 3) = val->val[3].m64_u32[lane]; ++} ++#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane) ++ ++//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) ++ ++//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) ++ ++//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) ++ ++//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); ++//current IA SIMD doesn't support float16 ++ ++void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane) ++{ ++ *(ptr) = val->val[0].m64_f32[lane]; ++ *(ptr + 1) = val->val[1].m64_f32[lane]; ++ *(ptr + 2) = val->val[2].m64_f32[lane]; ++ *(ptr + 3) = val->val[3].m64_f32[lane]; ++} ++#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane) ++ ++//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); ++#define vst4_lane_p8 vst4_lane_u8 ++ ++//void 
vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] ++void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); ++#define vst4_lane_p16 vst4_lane_u16 ++ ++//************************************************************************************************** ++//************************ Extract lanes from a vector ******************************************** ++//************************************************************************************************** ++//These intrinsics extract a single lane (element) from a vector. ++uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++#define vget_lane_u8(vec, lane) vec.m64_u8[lane] ++ ++uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] ++#define vget_lane_u16(vec, lane) vec.m64_u16[lane] ++ ++ ++uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_u32(vec, lane) vec.m64_u32[lane] ++ ++int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] ++#define vget_lane_s8(vec, lane) vec.m64_i8[lane] ++ ++int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] ++#define vget_lane_s16(vec, lane) vec.m64_i16[lane] ++ ++int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_s32(vec, lane) vec.m64_i32[lane] ++ ++poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] ++#define vget_lane_p8 vget_lane_u8 ++ ++poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] ++#define vget_lane_p16 vget_lane_u16 ++ ++float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] ++#define vget_lane_f32(vec, lane) vec.m64_f32[lane] ++ ++uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++#define vgetq_lane_u8 _MM_EXTRACT_EPI8 ++ ++uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] ++#define vgetq_lane_u16 _MM_EXTRACT_EPI16 ++ ++uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++#define vgetq_lane_u32 _MM_EXTRACT_EPI32 ++ ++int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] ++#define vgetq_lane_s8 vgetq_lane_u8 ++ ++int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] ++#define vgetq_lane_s16 vgetq_lane_u16 ++ ++int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++#define vgetq_lane_s32 vgetq_lane_u32 ++ ++poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] ++#define vgetq_lane_p8 vgetq_lane_u8 ++ ++poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] ++#define vgetq_lane_p16 vgetq_lane_u16 ++ ++float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] ++_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) ++{ ++ int32_t ilane; ++ ilane = _MM_EXTRACT_PS(vec,lane); ++ return *(float*)&ilane; ++} ++ ++int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++#define vget_lane_s64(vec, lane) vec.m64_i64[0] ++ ++uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 ++#define vget_lane_u64(vec, 
lane) vec.m64_u64[0] ++ ++ ++int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++#define vgetq_lane_s64 (int64_t) vgetq_lane_u64 ++ ++uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 ++#define vgetq_lane_u64 _MM_EXTRACT_EPI64 ++ ++// ***************** Set lanes within a vector ******************************************** ++// ************************************************************************************** ++//These intrinsics set a single lane (element) within a vector. ++//same functions as vld1_lane_xx ones, but take the value to be set directly. ++ ++uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane) ++{ ++ uint8_t val; ++ val = value; ++ return vld1_lane_u8(&val, vec, lane); ++} ++ ++uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16_t val; ++ val = value; ++ return vld1_lane_u16(&val, vec, lane); ++} ++ ++uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32_t val; ++ val = value; ++ return vld1_lane_u32(&val, vec, lane); ++} ++ ++int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane) ++{ ++ int8_t val; ++ val = value; ++ return vld1_lane_s8(&val, vec, lane); ++} ++ ++int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane) ++{ ++ int16_t val; ++ val = value; ++ return vld1_lane_s16(&val, vec, lane); ++} ++ ++int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane) ++{ ++ int32_t val; ++ val = value; ++ return vld1_lane_s32(&val, vec, lane); ++} ++ ++poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 ++#define vset_lane_p8 vset_lane_u8 ++ ++poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 ++#define vset_lane_p16 vset_lane_u16 ++ ++float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32_t val; ++ val = value; ++ return vld1_lane_f32(&val, vec, lane); ++} ++ ++uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane) ++{ ++ uint8_t val; ++ val = value; ++ return vld1q_lane_u8(&val, vec, lane); ++} ++ ++uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane) ++{ ++ uint16_t val; ++ val = value; ++ return vld1q_lane_u16(&val, 
vec, lane); ++} ++ ++uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane) ++{ ++ uint32_t val; ++ val = value; ++ return vld1q_lane_u32(&val, vec, lane); ++} ++ ++int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane) ++{ ++ int8_t val; ++ val = value; ++ return vld1q_lane_s8(&val, vec, lane); ++} ++ ++int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane) ++{ ++ int16_t val; ++ val = value; ++ return vld1q_lane_s16(&val, vec, lane); ++} ++ ++int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane) ++{ ++ int32_t val; ++ val = value; ++ return vld1q_lane_s32(&val, vec, lane); ++} ++ ++poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 ++#define vsetq_lane_p8 vsetq_lane_u8 ++ ++poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 ++#define vsetq_lane_p16 vsetq_lane_u16 ++ ++float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 ++_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane) ++{ ++ float32_t val; ++ val = value; ++ return vld1q_lane_f32(&val, vec, lane); ++} ++ ++int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane) ++{ ++ int64_t val; ++ val = value; ++ return vld1_lane_s64(&val, vec, lane); ++} ++ ++uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane) ++{ ++ uint64_t val; ++ val = value; ++ return vld1_lane_u64(&val, vec, lane); ++} ++ ++int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane) ++{ ++ uint64_t val; ++ val = value; ++ return vld1q_lane_s64(&val, vec, lane); ++} ++ ++uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 ++#define vsetq_lane_u64 vsetq_lane_s64 ++ ++// ******************************************************************************* ++// **************** Initialize a vector from bit pattern *************************** ++// ******************************************************************************* ++//These intrinsics create a vector from a literal bit pattern. 
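/* [Editor's illustration -- not part of the patch] The vcreate_* emulation in this
   section simply reinterprets a 64-bit integer as the __m64_128 container, so bit 0
   of the value becomes bit 0 of lane 0 (little-endian lane order, matching NEON).
   A minimal usage sketch, assuming this shim is installed as <arm_neon.h> on an x86
   include path (an assumption made for this example only). Note that the argument
   must be an lvalue, because the macro takes its address. */
#include <stdint.h>
#include <stdio.h>
#include <arm_neon.h>

static void vcreate_sketch(void)
{
    uint64_t bits = 0x0706050403020100ULL;       /* byte n of the literal is n     */
    uint8x8_t v = vcreate_u8(bits);              /* lanes {0,1,2,3,4,5,6,7}        */
    printf("%u %u\n",
           (unsigned)vget_lane_u8(v, 0),         /* lane 0 <- least significant byte */
           (unsigned)vget_lane_u8(v, 7));        /* prints "0 7"                   */
}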
++int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s8(a) (*(__m64_128*)&(a)) ++ ++ ++int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s16 vcreate_s8 ++ ++int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s32 vcreate_s8 ++ ++float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 ++//no IA32 SIMD avalilable ++ ++float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_f32(a) (*(__m64_128*)&(a)) ++ ++uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u8 vcreate_s8 ++ ++uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u16 vcreate_s16 ++ ++uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u32 vcreate_s32 ++ ++uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_u64 vcreate_s8 ++ ++ ++poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_p8 vcreate_u8 ++ ++poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_p16 vcreate_u16 ++ ++int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 ++#define vcreate_s64 vcreate_u64 ++ ++//********************* Set all lanes to same value ******************************** ++//********************************************************************************* ++//These intrinsics set all lanes to the same value. ++uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint8x8_t res; ++ int i; ++ for (i = 0; i<8; i++) { ++ res.m64_u8[i] = value; ++ } ++ return res; ++} ++ ++uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint16x4_t res; ++ int i; ++ for (i = 0; i<4; i++) { ++ res.m64_u16[i] = value; ++ } ++ return res; ++} ++ ++uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = value; ++ res.m64_u32[1] = value; ++ return res; ++} ++ ++int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int8x8_t res; ++ int i; ++ for (i = 0; i<8; i++) { ++ res.m64_i8[i] = value; ++ } ++ return res; ++} ++ ++int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int16x4_t res; ++ int i; ++ for (i = 0; i<4; i++) { ++ res.m64_i16[i] = value; ++ } ++ return res; ++} ++ ++int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t res; ++ res.m64_i32[0] = value; ++ res.m64_i32[1] = value; ++ return res; ++} ++ ++poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 ++#define vdup_n_p8 vdup_n_u8 ++ ++poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 ++#define vdup_n_p16 vdup_n_s16 ++ ++float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 ++_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = value; ++ res.m64_f32[1] = value; ++ return res; ++} ++ ++uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) ++ ++uint16x8_t 
vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) ++ ++uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) ++ ++int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_s8 _mm_set1_epi8 ++ ++int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_s16 _mm_set1_epi16 ++ ++int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_s32 _mm_set1_epi32 ++ ++poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++#define vdupq_n_p8 vdupq_n_u8 ++ ++poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++#define vdupq_n_p16 vdupq_n_u16 ++ ++float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 ++#define vdupq_n_f32 _mm_set1_ps ++ ++int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = value; ++ return res; ++} ++ ++uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value) ++{ ++ uint64x1_t res; ++ res.m64_u64[0] = value; ++ return res; ++} ++ ++int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) ++{ ++ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate ++ return LOAD_SI128(value2); ++} ++ ++uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) ++{ ++ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate ++ return LOAD_SI128(val); ++} ++ ++//**** Set all lanes to same value ************************ ++//Same functions as above - just aliaces.******************** ++//Probably they reflect the fact that 128-bit functions versions use VMOV instruction ********** ++uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 ++#define vmov_n_u8 vdup_n_s8 ++ ++uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 ++#define vmov_n_u16 vdup_n_s16 ++ ++uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 ++#define vmov_n_u32 vdup_n_u32 ++ ++int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 ++#define vmov_n_s8 vdup_n_s8 ++ ++int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 ++#define vmov_n_s16 vdup_n_s16 ++ ++int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 ++#define vmov_n_s32 vdup_n_s32 ++ ++poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 ++#define vmov_n_p8 vdup_n_u8 ++ ++poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 ++#define vmov_n_p16 vdup_n_s16 ++ ++float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 ++#define vmov_n_f32 vdup_n_f32 ++ ++uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_u8 vdupq_n_u8 ++ ++uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_u16 vdupq_n_s16 ++ ++uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_u32 vdupq_n_u32 ++ ++int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_s8 vdupq_n_s8 ++ ++int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_s16 vdupq_n_s16 ++ ++int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_s32 vdupq_n_s32 ++ ++poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 ++#define vmovq_n_p8 vdupq_n_u8 ++ ++poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 ++#define vmovq_n_p16 vdupq_n_s16 ++ ++float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 ++#define vmovq_n_f32 vdupq_n_f32 ++ ++int64x1_t 
vmov_n_s64(int64_t value); // VMOV d0,r0,r0 ++#define vmov_n_s64 vdup_n_s64 ++ ++uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 ++#define vmov_n_u64 vdup_n_u64 ++ ++int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 ++#define vmovq_n_s64 vdupq_n_s64 ++ ++uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 ++#define vmovq_n_u64 vdupq_n_u64 ++ ++//**************Set all lanes to the value of one lane of a vector ************* ++//**************************************************************************** ++//here shuffle is better solution than lane extraction followed by set1 function ++uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) ++{ ++ uint8x8_t res; ++ uint8_t valane; ++ int i = 0; ++ valane = vec.m64_u8[lane]; ++ for (i = 0; i<8; i++) { ++ res.m64_u8[i] = valane; ++ } ++ return res; ++} ++ ++uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) ++{ ++ uint16x4_t res; ++ uint16_t valane; ++ valane = vec.m64_u16[lane]; ++ res.m64_u16[0] = valane; ++ res.m64_u16[1] = valane; ++ res.m64_u16[2] = valane; ++ res.m64_u16[3] = valane; ++ return res; ++} ++ ++uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) ++{ ++ uint32x2_t res; ++ res.m64_u32[0] = vec.m64_u32[lane]; ++ res.m64_u32[1] = res.m64_u32[0]; ++ return res; ++} ++ ++int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++#define vdup_lane_s8 vdup_lane_u8 ++ ++int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++#define vdup_lane_s16 vdup_lane_u16 ++ ++int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++#define vdup_lane_s32 vdup_lane_u32 ++ ++poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] ++#define vdup_lane_p8 vdup_lane_u8 ++ ++poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] ++#define vdup_lane_p16 vdup_lane_s16 ++ ++float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] ++_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = vec.m64_f32[lane]; ++ res.m64_f32[1] = res.m64_f32[0]; ++ return res; ++} ++ ++uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0] ++{ ++ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane}; ++ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8); ++} ++ ++uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0] ++{ ++ //we could use 8bit shuffle for 16 bit as well ++ const int8_t lane16 = ((int8_t) lane) << 1; ++ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, ++ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1}; ++ return _mm_shuffle_epi8 
(_pM128i(vec), *(__m128i*)lanemask_e16); ++} ++ ++uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6)) ++ ++int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++#define vdupq_lane_s8 vdupq_lane_u8 ++ ++int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++#define vdupq_lane_s16 vdupq_lane_u16 ++ ++int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_s32 vdupq_lane_u32 ++ ++poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] ++#define vdupq_lane_p8 vdupq_lane_u8 ++ ++poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] ++#define vdupq_lane_p16 vdupq_lane_s16 ++ ++float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] ++#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane)) ++ ++int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++#define vdup_lane_s64(vec,lane) vec ++ ++uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 ++#define vdup_lane_u64(vec,lane) vec ++ ++int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane) ++{ ++ __m128i vec128; ++ vec128 = _pM128i(vec); ++ return _mm_unpacklo_epi64(vec128,vec128); ++} ++ ++uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 ++#define vdupq_lane_u64 vdupq_lane_s64 ++ ++// ******************************************************************** ++// ******************** Combining vectors ***************************** ++// ******************************************************************** ++//These intrinsics join two 64 bit vectors into a single 128bit vector. 
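/* [Editor's illustration -- not part of the patch] vcombine_* below joins two 64-bit
   "d" vectors into one 128-bit "q" vector via _mm_unpacklo_epi64: the "low" argument
   supplies bits 0..63 (the lower half of the lanes) and "high" supplies bits 64..127.
   The vget_low_* / vget_high_* intrinsics further down are the inverse operations.
   A minimal sketch, assuming the shim is included as <arm_neon.h>. */
#include <arm_neon.h>

static int32x4_t combine_sketch(void)
{
    int32x2_t lo = vdup_n_s32(1);          /* {1, 1}                          */
    int32x2_t hi = vdup_n_s32(2);          /* {2, 2}                          */
    int32x4_t q  = vcombine_s32(lo, hi);   /* {1, 1, 2, 2} -- low half first  */
    int32x2_t back = vget_high_s32(q);     /* recovers {2, 2}                 */
    (void)back;                            /* silence unused-variable warning */
    return q;
}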
++int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 ++#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 ++#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 ++#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 ++#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) ++ ++float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 ++//current IA SIMD doesn't support float16 ++ ++float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high) ++{ ++ __m128i res; ++ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); ++ return _M128(res); ++} ++ ++uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 ++#define vcombine_u8 vcombine_s8 ++ ++uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 ++#define vcombine_u16 vcombine_s16 ++ ++uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 ++#define vcombine_u32 vcombine_s32 ++ ++uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 ++#define vcombine_u64 vcombine_s64 ++ ++poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 ++#define vcombine_p8 vcombine_u8 ++ ++poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 ++#define vcombine_p16 vcombine_u16 ++ ++//********************************************************************** ++//************************* Splitting vectors ************************** ++//********************************************************************** ++//**************** Get high part ****************************************** ++//These intrinsics split a 128 bit vector into 2 component 64 bit vectors ++int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a) ++{ ++ int64x1_t res64; ++ __m128i res; ++ res = _mm_unpackhi_epi64(a,a); //SSE2 ++ return64(res); ++} ++ ++float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a) ++{ ++ __m128i res; ++ __m64_128 res64; ++ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a)); ++ return64(res); ++} ++ ++uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 ++#define vget_high_u8 vget_high_s8 ++ ++uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 ++#define vget_high_u16 vget_high_s16 ++ ++uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 ++#define vget_high_u32 
vget_high_s32 ++ ++uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 ++#define vget_high_u64 vget_high_s64 ++ ++poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 ++#define vget_high_p8 vget_high_u8 ++ ++poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 ++#define vget_high_p16 vget_high_u16 ++ ++//********************** Get low part ********************** ++//********************************************************** ++int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0 ++{ ++ int16x4_t res64; ++ return64(a); ++} ++ ++int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0 ++{ ++ int16x4_t res64; ++ return64(a); ++} ++ ++int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0 ++{ ++ int32x2_t res64; ++ return64(a); ++} ++ ++int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0 ++{ ++ int64x1_t res64; ++ return64 (a); ++} ++ ++float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 ++// IA32 SIMD doesn't work with 16bit floats currently ++ ++float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 ++_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a) ++{ ++ float32x2_t res64; ++ _M64f(res64, a); ++ return res64; ++} ++ ++uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 ++#define vget_low_u8 vget_low_s8 ++ ++uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 ++#define vget_low_u16 vget_low_s16 ++ ++uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 ++#define vget_low_u32 vget_low_s32 ++ ++uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 ++#define vget_low_u64 vget_low_s64 ++ ++poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 ++#define vget_low_p8 vget_low_u8 ++ ++poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 ++#define vget_low_p16 vget_low_s16 ++ ++//************************************************************************** ++//************************ Converting vectors ********************************** ++//************************************************************************** ++//************* Convert from float *************************************** ++// need to set _MM_SET_ROUNDING_MODE ( x) accordingly ++int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 ++_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only ++ return64(res); ++} ++ ++uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 ++_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a) ++{ ++ //may be not effective compared with a serial SIMD solution ++ uint32x2_t res64; ++ __m128i res; ++ res = vcvtq_u32_f32(_pM128(a)); ++ return64(res); ++} ++ ++int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 ++#define vcvtq_s32_f32 _mm_cvttps_epi32 ++ ++uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 ++_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 ++{ ++ //No single instruction SSE solution but we could implement it as following: ++ __m128i resi; ++ __m128 zero, mask, a_pos, mask_f_max_si, res; ++ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; ++ zero = _mm_setzero_ps(); ++ mask = _mm_cmpgt_ps(a, zero); ++ a_pos = _mm_and_ps(a, mask); ++ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff); ++ res = 
_mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything ++ resi = _mm_cvttps_epi32(res); ++ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si); ++} ++ ++// ***** Convert to the fixed point with the number of fraction bits specified by b *********** ++//************************************************************************************************* ++int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 ++_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b) ++{ ++ int32x2_t res64; ++ return64(vcvtq_n_s32_f32(_pM128(a),b)); ++} ++ ++uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 ++_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b) ++{ ++ uint32x2_t res; ++ float convconst; ++ convconst = (float)((uint32_t)1 << b); ++ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst); ++ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst); ++ return res; ++} ++ ++int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 ++_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; ++ __m128 cconst128; ++ __m128i mask, res; ++ convconst = (float)(1 << b); ++ cconst128 = vdupq_n_f32(convconst); ++ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128)); ++ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask); ++ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000 ++} ++ ++uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 ++_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128; ++ convconst = (float)(1 << b); ++ cconst128 = vdupq_n_f32(convconst); ++ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); ++} ++ ++//***************** Convert to float ************************* ++//************************************************************* ++float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 ++_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits ++{ ++ float32x2_t res; ++ res.m64_f32[0] = (float) a.m64_i32[0]; ++ res.m64_f32[1] = (float) a.m64_i32[1]; ++ return res; ++} ++ ++float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 ++_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = (float) a.m64_u32[0]; ++ res.m64_f32[1] = (float) a.m64_u32[1]; ++ return res; ++} ++ ++float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 ++#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) ++ ++float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 ++_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 ++{ ++ //solution may be not optimal ++ __m128 two16, fHi, fLo; ++ __m128i hi, lo; ++ two16 = _mm_set1_ps((float)0x10000); //2^16 ++ // Avoid double rounding by doing two exact conversions ++ // of high and low 16-bit segments ++ hi = _mm_srli_epi32(a, 16); ++ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); ++ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); ++ fLo = _mm_cvtepi32_ps(lo); ++ // do single rounding according to current rounding mode ++ return _mm_add_ps(fHi, fLo); ++} ++ ++// ***** Convert to the float from fixed point with the number of fraction bits specified by b *********** ++float32x2_t vcvt_n_f32_s32(int32x2_t 
a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 ++_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b) ++{ ++ float32x2_t res; ++ float convconst; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ res.m64_f32[0] = a.m64_i32[0] * convconst; ++ res.m64_f32[1] = a.m64_i32[1] * convconst; ++ return res; ++} ++ ++float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 ++_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32 ++{ ++ float32x2_t res; ++ float convconst; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ res.m64_f32[0] = a.m64_u32[0] * convconst; ++ res.m64_f32[1] = a.m64_u32[1] * convconst; ++ return res; ++} ++ ++float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 ++_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128, af; ++ convconst = (float)(1. / ((uint32_t)1 << b)); ++ af = _mm_cvtepi32_ps(a); ++ cconst128 = vdupq_n_f32(convconst); ++ return _mm_mul_ps(af,cconst128); ++} ++ ++float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 ++_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b) ++{ ++ float convconst; ++ __m128 cconst128, af; ++ convconst = (float)(1. / (1 << b)); ++ af = vcvtq_f32_u32(a); ++ cconst128 = vdupq_n_f32(convconst); ++ return _mm_mul_ps(af,cconst128); ++} ++ ++//**************Convert between floats *********************** ++//************************************************************ ++float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 ++//Intel SIMD doesn't support 16bits floats curently ++ ++float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 ++//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits ++ ++//************Vector narrow integer conversion (truncation) ****************** ++//**************************************************************************** ++int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 ++_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0 ++{ ++ int8x8_t res64; ++ __m128i res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; ++ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only ++ return64(res); ++} ++ ++int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 ++_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0 ++{ ++ int16x4_t res64; ++ __m128i res; ++ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; ++ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only ++ return64(res); ++} ++ ++int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 ++_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a) ++{ ++ //may be not effective compared with a serial implementation ++ int32x2_t res64; ++ __m128i res; ++ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0) ++ return64(res); ++} ++ ++uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 ++#define vmovn_u16 vmovn_s16 ++ ++uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 ++#define vmovn_u32 vmovn_s32 ++ ++uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 ++#define vmovn_u64 vmovn_s64 ++ ++//**************** Vector 
long move *********************** ++//*********************************************************** ++int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 ++#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1 ++ ++int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 ++#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1 ++ ++int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 ++#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1 ++ ++uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 ++#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1 ++ ++uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0 ++#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1 ++ ++uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 ++#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1 ++ ++//*************Vector saturating narrow integer***************** ++//************************************************************** ++int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 ++_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = _mm_packs_epi16(a, a); ++ return64(res); ++} ++ ++int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 ++_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = _mm_packs_epi32(a, a); ++ return64(res); ++} ++ ++int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution ++{ ++ int32x2_t res; ++ _NEON2SSE_ALIGN_16 int64_t atmp[2]; ++ _mm_store_si128((__m128i*)atmp, a); ++ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX; ++ if(atmp[0]SINT_MAX) atmp[1] = SINT_MAX; ++ if(atmp[1]32 bits numbers ++ res_hi = _mm_or_si128(a, mask); ++ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(res_hi); ++} ++//************* Vector saturating narrow integer signed->unsigned ************** ++//***************************************************************************** ++uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 ++_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a) ++{ ++ uint8x8_t res64; ++ __m128i res; ++ res = _mm_packus_epi16(a, a); //use low 64bits only ++ return64(res); ++} ++ ++uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 ++_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a) ++{ ++ uint16x4_t res64; ++ __m128i res; ++ res = _MM_PACKUS1_EPI32(a); //use low 64bits only ++ return64(res); ++} ++ ++uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 ++_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a) ++{ ++ uint32x2_t res64; ++ __m128i res_hi,res_lo, zero, cmp; ++ zero = _mm_setzero_si128(); ++ res_hi = _mm_srli_epi64(a, 32); ++ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero ++ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 ++ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive ++ res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff ++ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits ++ return64(res_lo); ++} ++ ++// ******************************************************** ++// **************** Table look up ************************** ++// ******************************************************** ++//VTBL (Vector Table Lookup) uses byte indexes in a 
control vector to look up byte values ++//in a table and generate a new vector. Indexes out of range return 0. ++//for Intel SIMD we need to set the MSB to 1 for zero return ++uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ __m128i c7, maskgt, bmask, b128; ++ c7 = _mm_set1_epi8 (7); ++ b128 = _pM128i(b); ++ maskgt = _mm_cmpgt_epi8(b128,c7); ++ bmask = _mm_or_si128(b128,maskgt); ++ bmask = _mm_shuffle_epi8(_pM128i(a),bmask); ++ return64(bmask); ++} ++ ++int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 ++#define vtbl1_s8 vtbl1_u8 ++ ++poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 ++#define vtbl1_p8 vtbl1_u8 ++ ++//Special trick to avoid __declspec(align('8')) won't be aligned" error ++//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b) ++{ ++ uint8x8_t res64; ++ __m128i c15, a01, maskgt15, bmask, b128; ++ c15 = _mm_set1_epi8 (15); ++ b128 = _pM128i(b); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ bmask = _mm_or_si128(b128, maskgt15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1])); ++ a01 = _mm_shuffle_epi8(a01, bmask); ++ return64(a01); ++} ++#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b) ++ ++//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++#define vtbl2_s8 vtbl2_u8 ++ ++//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 ++#define vtbl2_p8 vtbl2_u8 ++ ++//Special trick to avoid __declspec(align('16')) won't be aligned" error ++//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128; ++ c15 = _mm_set1_epi8 (15); ++ c23 = _mm_set1_epi8 (23); ++ b128 = _pM128i(b); ++ maskgt23 = _mm_cmpgt_epi8(b128,c23); ++ bmask = _mm_or_si128(b128, maskgt23); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); ++ sh0 = _mm_shuffle_epi8(a01, bmask); ++ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1 ++ return64(sh0); ++} ++#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b) ++ ++//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++#define vtbl3_s8 vtbl3_u8 ++ ++//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 ++#define vtbl3_p8 vtbl3_u8 ++ ++//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128; ++ c15 = _mm_set1_epi8 (15); ++ c31 = _mm_set1_epi8 (31); ++ b128 = _pM128i(b); ++ maskgt31 = _mm_cmpgt_epi8(b128,c31); ++ bmask = _mm_or_si128(b128, maskgt31); ++ maskgt15 = _mm_cmpgt_epi8(b128,c15); ++ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); ++ a23 = 
_mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3])); ++ sh0 = _mm_shuffle_epi8(a01, bmask); ++ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1 ++ return64(sh0); ++} ++#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b) ++ ++//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++#define vtbl4_s8 vtbl4_u8 ++ ++//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 ++#define vtbl4_p8 vtbl4_u8 ++ ++//****************** Extended table look up intrinsics *************************** ++//********************************************************************************** ++//VTBX (Vector Table Extension) works in the same way as VTBL do, ++// except that indexes out of range leave the destination element unchanged. ++ ++uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) ++{ ++ uint8x8_t res64; ++ __m128i c7, maskgt, sh, c128; ++ c7 = _mm_set1_epi8 (7); ++ c128 = _pM128i(c); ++ maskgt = _mm_cmpgt_epi8(c128,c7); ++ c7 = _mm_and_si128(maskgt,_pM128i(a)); ++ sh = _mm_shuffle_epi8(_pM128i(b),c128); ++ sh = _mm_andnot_si128(maskgt,sh); ++ sh = _mm_or_si128(sh,c7); ++ return64(sh); ++} ++ ++int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 ++#define vtbx1_s8 vtbx1_u8 ++ ++poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 ++#define vtbx1_p8 vtbx1_u8 ++ ++//Special trick to avoid __declspec(align('8')) won't be aligned" error ++//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c) ++{ ++ uint8x8_t res64; ++ __m128i c15, b01, maskgt15, sh, c128; ++ c15 = _mm_set1_epi8 (15); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128, c15); ++ c15 = _mm_and_si128(maskgt15, _pM128i(a)); ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1])); ++ sh = _mm_shuffle_epi8(b01, c128); ++ sh = _mm_andnot_si128(maskgt15, sh); ++ sh = _mm_or_si128(sh,c15); ++ return64(sh); ++} ++#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++#define vtbx2_s8 vtbx2_u8 ++ ++//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 ++#define vtbx2_p8 vtbx2_u8 ++ ++//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128; ++ c15 = _mm_set1_epi8 (15); ++ c23 = _mm_set1_epi8 (23); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128,c15); ++ maskgt23 = _mm_cmpgt_epi8(c128,c23); ++ c23 = _mm_and_si128(maskgt23, _pM128i(a)); ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); ++ sh0 = _mm_shuffle_epi8(b01, c128); ++ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); ++ sh0 = 
_mm_andnot_si128(maskgt23,sh0); ++ sh0 = _mm_or_si128(sh0,c23); ++ return64(sh0); ++} ++#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c); ++#define vtbx3_s8 vtbx3_u8 ++ ++//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 ++poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c); ++#define vtbx3_p8 vtbx3_u8 ++ ++//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c) ++{ ++ //solution may be not optimal ++ uint8x8_t res64; ++ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128; ++ c15 = _mm_set1_epi8 (15); ++ c31 = _mm_set1_epi8 (31); ++ c128 = _pM128i(c); ++ maskgt15 = _mm_cmpgt_epi8(c128,c15); ++ maskgt31 = _mm_cmpgt_epi8(c128,c31); ++ c31 = _mm_and_si128(maskgt31, _pM128i(a)); ++ ++ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); ++ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3])); ++ sh0 = _mm_shuffle_epi8(b01, c128); ++ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15) ++ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); ++ sh0 = _mm_andnot_si128(maskgt31,sh0); ++ sh0 = _mm_or_si128(sh0,c31); ++ return64(sh0); ++} ++#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c) ++ ++//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c); ++#define vtbx4_s8 vtbx4_u8 ++ ++//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 ++poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c); ++#define vtbx4_p8 vtbx4_u8 ++ ++//************************************************************************************************* ++// *************************** Operations with a scalar value ********************************* ++//************************************************************************************************* ++ ++//******* Vector multiply accumulate by scalar ************************************************* ++//********************************************************************************************** ++int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0] ++{ ++ int16_t c; ++ int16x4_t scalar; ++ c = vget_lane_s16(v, l); ++ scalar = vdup_n_s16(c); ++ return vmla_s16(a, b, scalar); ++} ++ ++int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0] ++{ ++ int32_t c; ++ int32x2_t scalar; ++ c = vget_lane_s32(v, l); ++ scalar = vdup_n_s32(c); ++ return vmla_s32(a, b, scalar); ++} ++ ++uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] ++#define vmla_lane_u16 vmla_lane_s16 ++ ++ ++uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] ++#define vmla_lane_u32 vmla_lane_s32 ++ ++float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, 
__constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) ++{ ++ float32_t vlane; ++ float32x2_t c; ++ vlane = vget_lane_f32(v, l); ++ c = vdup_n_f32(vlane); ++ return vmla_f32(a,b,c); ++} ++ ++int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] ++{ ++ int16_t vlane; ++ int16x8_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdupq_n_s16(vlane); ++ return vmlaq_s16(a,b,c); ++} ++ ++int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] ++{ ++ int32_t vlane; ++ int32x4_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdupq_n_s32(vlane); ++ return vmlaq_s32(a,b,c); ++} ++ ++uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++#define vmlaq_lane_u16 vmlaq_lane_s16 ++ ++uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++#define vmlaq_lane_u32 vmlaq_lane_s32 ++ ++float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32_t vlane; ++ float32x4_t c; ++ vlane = vget_lane_f32(v, l); ++ c = vdupq_n_f32(vlane); ++ return vmlaq_f32(a,b,c); ++} ++ ++//***************** Vector widening multiply accumulate by scalar ********************** ++//*************************************************************************************** ++int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlal_s16(a, b, c); ++} ++ ++int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmlal_s32(a, b, c); ++} ++ ++uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_u16(v, l); ++ c = vdup_n_u16(vlane); ++ return vmlal_u16(a, b, c); ++} ++ ++uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmlal_u32(a, b, c); ++} ++ 
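++// --- Illustrative usage sketch (editor's addition, not from the upstream NEON_2_SSE.h) ---
++// The lane-wise widening multiply-accumulate wrappers above all follow the same pattern:
++// broadcast the selected lane with vdup_n_*, then reuse the plain vector intrinsic.
++// A minimal caller, assuming a hypothetical 4-tap FIR helper named fir4_tap0
++// (the name and its arguments are illustrative only):
++#if 0 /* sketch only, intentionally not compiled */
++static int32x4_t fir4_tap0(int32x4_t acc, int16x4_t samples, int16x4_t coeffs)
++{
++    // acc[i] += (int32_t)samples[i] * (int32_t)coeffs[0]; the lane index must be a
++    // compile-time constant, as required by __constrange(0,3)
++    return vmlal_lane_s16(acc, samples, coeffs, 0);
++}
++#endif
++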
++// ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* ++// ************************************************************************************************ ++int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vqdmlal_s16(a, b, c); ++} ++ ++int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) ++{ ++ int32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vqdmlal_s32(a, b, c); ++} ++ ++// ****** Vector multiply subtract by scalar ***************** ++// ************************************************************* ++int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmls_s16(a, b, c); ++} ++ ++int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmls_s32(a, b, c); ++} ++ ++uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmls_s16(a, b, c); ++} ++ ++uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmls_u32(a, b, c); ++} ++ ++float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) ++{ ++ float32_t vlane; ++ float32x2_t c; ++ vlane = (float) vget_lane_f32(v, l); ++ c = vdup_n_f32(vlane); ++ return vmls_f32(a,b,c); ++} ++ ++int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0] ++{ ++ int16_t vlane; ++ int16x8_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdupq_n_s16(vlane); ++ return vmlsq_s16(a, b,c); ++} ++ ++int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t 
vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0] ++{ ++ int32_t vlane; ++ int32x4_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdupq_n_s32(vlane); ++ return vmlsq_s32(a,b,c); ++} ++ ++uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x8_t c; ++ vlane = vget_lane_u16(v, l); ++ c = vdupq_n_u16(vlane); ++ return vmlsq_u16(a,b,c); ++} ++ ++uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x4_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdupq_n_u32(vlane); ++ return vmlsq_u32(a,b,c); ++} ++ ++float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32_t vlane; ++ float32x4_t c; ++ vlane = (float) vget_lane_f32(v, l); ++ c = vdupq_n_f32(vlane); ++ return vmlsq_f32(a,b,c); ++} ++ ++// **** Vector widening multiply subtract by scalar **** ++// **************************************************** ++int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlsl_s16(a, b, c); ++} ++ ++int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vmlsl_s32(a, b, c); ++} ++ ++uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vmlsl_s16(a, b, c); ++} ++ ++uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t c; ++ vlane = vget_lane_u32(v, l); ++ c = vdup_n_u32(vlane); ++ return vmlsl_u32(a, b, c); ++} ++ ++//********* Vector widening saturating doubling multiply subtract by scalar ************************** ++//****************************************************************************************************** ++int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, 
__constrange(0,3) int l) ++{ ++ int16_t vlane; ++ int16x4_t c; ++ vlane = vget_lane_s16(v, l); ++ c = vdup_n_s16(vlane); ++ return vqdmlsl_s16(a, b, c); ++} ++ ++int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32_t vlane; ++ int32x2_t c; ++ vlane = vget_lane_s32(v, l); ++ c = vdup_n_s32(vlane); ++ return vqdmlsl_s32(a, b, c); ++} ++//********** Vector multiply with scalar ***************************** ++int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0] ++{ ++ int16x4_t b16x4; ++ b16x4 = vdup_n_s16(b); ++ return vmul_s16(a, b16x4); ++} ++ ++int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] ++_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0] ++{ ++ //serial solution looks faster ++ int32x2_t b32x2; ++ b32x2 = vdup_n_s32(b); ++ return vmul_s32(a, b32x2); ++} ++ ++float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] ++_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0] ++{ ++ float32x2_t b32x2; ++ b32x2 = vdup_n_f32(b); ++ return vmul_f32(a, b32x2); ++} ++ ++uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] ++_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0] ++{ ++ uint16x4_t b16x4; ++ b16x4 = vdup_n_s16(b); ++ return vmul_s16(a, b16x4); ++} ++ ++uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] ++_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0] ++{ ++ //serial solution looks faster ++ uint32x2_t b32x2; ++ b32x2 = vdup_n_u32(b); ++ return vmul_u32(a, b32x2); ++} ++ ++int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] ++{ ++ int16x8_t b16x8; ++ b16x8 = vdupq_n_s16(b); ++ return vmulq_s16(a, b16x8); ++} ++ ++int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] ++_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] ++{ ++ int32x4_t b32x4; ++ b32x4 = vdupq_n_s32(b); ++ return vmulq_s32(a, b32x4); ++} ++ ++float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] ++_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] ++{ ++ float32x4_t b32x4; ++ b32x4 = vdupq_n_f32(b); ++ return vmulq_f32(a, b32x4); ++} ++ ++uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] ++_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] ++{ ++ uint16x8_t b16x8; ++ b16x8 = vdupq_n_s16(b); ++ return vmulq_s16(a, b16x8); ++} ++ ++uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] ++{ ++ uint32x4_t b32x4; ++ b32x4 = vdupq_n_u32(b); ++ return vmulq_u32(a, b32x4); ++} ++ ++//********** Vector multiply lane ***************************** ++int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); ++_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c) ++{ ++ int16x4_t b16x4; ++ int16_t vlane; 
++ vlane = vget_lane_s16(b, c); ++ b16x4 = vdup_n_s16(vlane); ++ return vmul_s16(a, b16x4); ++} ++ ++int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c) ++{ ++ int32x2_t b32x2; ++ int32_t vlane; ++ vlane = vget_lane_s32(b, c); ++ b32x2 = vdup_n_s32(vlane); ++ return vmul_s32(a, b32x2); ++} ++ ++float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c) ++{ ++ float32x2_t b32x2; ++ float32_t vlane; ++ vlane = vget_lane_f32(b, c); ++ b32x2 = vdup_n_f32(vlane); ++ return vmul_f32(a, b32x2); ++} ++ ++uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); ++#define vmul_lane_u16 vmul_lane_s16 ++ ++uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); ++#define vmul_lane_u32 vmul_lane_s32 ++ ++int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c); ++_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c) ++{ ++ int16x8_t b16x8; ++ int16_t vlane; ++ vlane = vget_lane_s16(b, c); ++ b16x8 = vdupq_n_s16(vlane); ++ return vmulq_s16(a, b16x8); ++} ++ ++int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c) ++{ ++ int32x4_t b32x4; ++ int32_t vlane; ++ vlane = vget_lane_s32(b, c); ++ b32x4 = vdupq_n_s32(vlane); ++ return vmulq_s32(a, b32x4); ++} ++ ++float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); ++_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c) ++{ ++ float32x4_t b32x4; ++ float32_t vlane; ++ vlane = vget_lane_f32(b, c); ++ b32x4 = vdupq_n_f32(vlane); ++ return vmulq_f32(a, b32x4); ++} ++ ++uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); ++#define vmulq_lane_u16 vmulq_lane_s16 ++ ++uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); ++#define vmulq_lane_u32 vmulq_lane_s32 ++ ++//**** Vector long multiply with scalar ************ ++int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0] ++{ ++ int16x4_t b16x4; ++ b16x4 = vdup_n_s16(val2); ++ return vmull_s16(vec1, b16x4); ++} ++ ++int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0] ++{ ++ int32x2_t b32x2; ++ b32x2 = vdup_n_s32(val2); ++ return vmull_s32(vec1, b32x2); ++} ++ ++uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0] ++{ ++ uint16x4_t b16x4; ++ b16x4 = vdup_n_s16(val2); ++ return vmull_s16(vec1, b16x4); ++} ++ ++uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] ++_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0] ++{ ++ uint32x2_t b32x2; ++ b32x2 = vdup_n_u32(val2); ++ return vmull_u32(vec1, b32x2); ++} ++ ++//**** Vector long multiply by scalar **** ++int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t 
vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0] ++{ ++ int16_t vlane; ++ int16x4_t b; ++ vlane = vget_lane_s16(val2, val3); ++ b = vdup_n_s16(vlane); ++ return vmull_s16(vec1, b); ++} ++ ++int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0] ++{ ++ int32_t vlane; ++ int32x2_t b; ++ vlane = vget_lane_s32(val2, val3); ++ b = vdup_n_s32(vlane); ++ return vmull_s32(vec1, b); ++} ++ ++uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0] ++_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0] ++{ ++ uint16_t vlane; ++ uint16x4_t b; ++ vlane = vget_lane_s16(val2, val3); ++ b = vdup_n_s16(vlane); ++ return vmull_s16(vec1, b); ++} ++ ++uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] ++_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0] ++{ ++ uint32_t vlane; ++ uint32x2_t b; ++ vlane = vget_lane_u32(val2, val3); ++ b = vdup_n_u32(vlane); ++ return vmull_u32(vec1, b); ++} ++ ++//********* Vector saturating doubling long multiply with scalar ******************* ++int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2) ++{ ++ //the serial soulution may be faster due to saturation ++ int16x4_t b; ++ b = vdup_n_s16(val2); ++ return vqdmull_s16(vec1, b); ++} ++ ++int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t b; ++ b = vdup_n_s32(val2); ++ return vqdmull_s32(vec1,b); //slow serial function!!!! ++} ++ ++//************* Vector saturating doubling long multiply by scalar *********************************************** ++int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) ++{ ++ int16_t c; ++ int16x4_t scalar; ++ c = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(c); ++ return vqdmull_s16(vec1, scalar); ++} ++ ++ ++int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32_t c; ++ int32x2_t scalar; ++ c = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(c); ++ return vqdmull_s32(vec1,scalar); //slow serial function!!!! 
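++    // (editor's note) The serial fallback noted above is expected: vqdmull doubles the
++    // widened 32x32 product and saturates to int64, and the one overflowing input pair,
++    // INT32_MIN * INT32_MIN, must clamp to INT64_MAX. SSE has no packed 64-bit saturating
++    // arithmetic, so the work is done element by element, which is why this wrapper
++    // carries the _NEON2SSE_REASON_SLOW_SERIAL performance warning.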
++} ++ ++// *****Vector saturating doubling multiply high with scalar ***** ++int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2) ++{ ++ int16x4_t res64; ++ return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); ++} ++ ++int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2) ++{ ++ int32x2_t res64; ++ return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); ++} ++ ++int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(val2); ++ return vqdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(val2); ++ return vqdmulhq_s32(vec1, scalar); ++} ++ ++//***** Vector saturating doubling multiply high by scalar **************** ++int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x4_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(vlane); ++ return vqdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32_t vlane; ++ int32x2_t scalar; ++ vlane = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(vlane); ++ return vqdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x8_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdupq_n_s16(vlane ); ++ return vqdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //solution may be not optimal ++ int32_t vlane; ++ int32x4_t scalar; ++ vlane = vgetq_lane_s32(_pM128i(val2), val3); ++ scalar = vdupq_n_s32(vlane ); ++ return vqdmulhq_s32(vec1, scalar); ++} ++ ++//******** Vector saturating rounding doubling multiply high with scalar *** ++int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16x4_t scalar; ++ scalar = vdup_n_s16(val2); ++ return vqrdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqrdmulh_n_s32(int32x2_t vec1, 
int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x2_t scalar; ++ scalar = vdup_n_s32(val2); ++ return vqrdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(val2); ++ return vqrdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(val2); ++ return vqrdmulhq_s32(vec1, scalar); ++} ++ ++//********* Vector rounding saturating doubling multiply high by scalar **** ++int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] ++_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x4_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdup_n_s16(vlane); ++ return vqrdmulh_s16(vec1, scalar); ++} ++ ++int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ int32_t vlane; ++ int32x2_t scalar; ++ vlane = vget_lane_s32(val2, val3); ++ scalar = vdup_n_s32(vlane); ++ return vqrdmulh_s32(vec1, scalar); ++} ++ ++int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] ++_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0] ++{ ++ //solution may be not optimal ++ int16_t vlane; ++ int16x8_t scalar; ++ vlane = vget_lane_s16(val2, val3); ++ scalar = vdupq_n_s16(vlane); ++ return vqrdmulhq_s16(vec1, scalar); ++} ++ ++int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) ++{ ++ //solution may be not optimal ++ int32_t vlane; ++ int32x4_t scalar; ++ vlane = vgetq_lane_s32(_pM128i(val2), val3); ++ scalar = vdupq_n_s32(vlane ); ++ return vqrdmulhq_s32(vec1, scalar); ++} ++ ++//**************Vector multiply accumulate with scalar ******************* ++int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0] ++{ ++ int16x4_t scalar; ++ scalar = vdup_n_s16(c); ++ return vmla_s16(a, b, scalar); ++} ++ ++int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0] ++{ ++ int32x2_t scalar; ++ scalar = vdup_n_s32(c); ++ return vmla_s32(a, b, scalar); ++} ++ ++uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t 
b, uint16_t c); // VMLA.I16 d0, d0, d0[0] ++#define vmla_n_u16 vmla_n_s16 ++ ++ ++uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] ++#define vmla_n_u32 vmla_n_s32 ++ ++ ++float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0] ++{ ++ float32x2_t scalar; ++ scalar = vdup_n_f32(c); ++ return vmla_f32(a, b, scalar); ++} ++ ++int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] ++{ ++ int16x8_t scalar; ++ scalar = vdupq_n_s16(c); ++ return vmlaq_s16(a,b,scalar); ++} ++ ++int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] ++{ ++ int32x4_t scalar; ++ scalar = vdupq_n_s32(c); ++ return vmlaq_s32(a,b,scalar); ++} ++ ++uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] ++#define vmlaq_n_u16 vmlaq_n_s16 ++ ++uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] ++#define vmlaq_n_u32 vmlaq_n_s32 ++ ++float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] ++{ ++ float32x4_t scalar; ++ scalar = vdupq_n_f32(c); ++ return vmlaq_f32(a,b,scalar); ++} ++ ++//************Vector widening multiply accumulate with scalar**************************** ++int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlal_s16(a, b, vc); ++} ++ ++int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmlal_s32(a, b, vc); ++} ++ ++uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlal_s16(a, b, vc); ++} ++ ++uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmlal_u32(a, b, vc); ++} ++ ++//************ Vector widening saturating doubling multiply accumulate with scalar ************** ++int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) ++{ ++ //not optimal SIMD soulution, serial may be faster ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vqdmlal_s16(a, b, vc); ++} ++ ++int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return 
vqdmlal_s32(a, b, vc); ++} ++ ++//******** Vector multiply subtract with scalar ************** ++int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmls_s16(a, b, vc); ++} ++ ++int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmls_s32(a, b, vc); ++} ++ ++uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] ++_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmls_s16(a, b, vc); ++} ++ ++uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] ++_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmls_u32(a, b, vc); ++} ++ ++float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] ++_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) ++{ ++ float32x2_t res; ++ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c; ++ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c; ++ return res; ++} ++ ++int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] ++{ ++ int16x8_t vc; ++ vc = vdupq_n_s16(c); ++ return vmlsq_s16(a, b,vc); ++} ++ ++int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0] ++{ ++ int32x4_t vc; ++ vc = vdupq_n_s32(c); ++ return vmlsq_s32(a,b,vc); ++} ++ ++uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] ++_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0] ++{ ++ uint32x4_t vc; ++ vc = vdupq_n_u32(c); ++ return vmlsq_u32(a,b,vc); ++} ++ ++uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0] ++{ ++ uint32x4_t vc; ++ vc = vdupq_n_u32(c); ++ return vmlsq_u32(a,b,vc); ++} ++ ++float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] ++_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) ++{ ++ float32x4_t vc; ++ vc = vdupq_n_f32(c); ++ return vmlsq_f32(a,b,vc); ++} ++ ++//**** Vector widening multiply subtract with scalar ****** ++int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0] ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vmlsl_s16(a, b, vc); ++} ++ ++int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0] ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vmlsl_s32(a, b, vc); ++} ++ ++uint32x4_t vmlsl_n_u16(uint32x4_t 
a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0] ++_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0] ++{ ++ uint16x4_t vc; ++ vc = vdup_n_u16(c); ++ return vmlsl_u16(a, b, vc); ++} ++ ++uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] ++_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0] ++{ ++ uint32x2_t vc; ++ vc = vdup_n_u32(c); ++ return vmlsl_u32(a, b, vc); ++} ++ ++//***** Vector widening saturating doubling multiply subtract with scalar ********* ++//********************************************************************************** ++int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] ++_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) ++{ ++ int16x4_t vc; ++ vc = vdup_n_s16(c); ++ return vqdmlsl_s16(a, b, vc); ++} ++ ++int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int32x2_t vc; ++ vc = vdup_n_s32(c); ++ return vqdmlsl_s32(a, b, vc); ++} ++ ++//******************* Vector extract *********************************************** ++//************************************************************************************* ++//VEXT (Vector Extract) extracts elements from the bottom end of the second operand ++//vector and the top end of the first, concatenates them, and places the result in the destination vector ++//c elements from the bottom end of the second operand and (8-c) from the top end of the first ++int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 ++_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL) ++{ ++ int8x8_t res; ++ int i; ++ for (i = 0; i<8 - c; i++) { ++ res.m64_i8[i] = a.m64_i8[i + c]; ++ } ++ for(i = 0; i> 1); ++ tmp = _mm_srli_epi32(res, 2); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); ++ tmp = _mm_srli_epi32(res, 4); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); ++ tmp = _mm_srli_epi32(res, 8); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); ++ tmp = _mm_srli_epi32(res, 16); ++ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); ++ ++ tmp = _mm_srli_epi32(res, 1); ++ tmp = _mm_and_si128(tmp, c55555555); ++ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); ++ ++ tmp = _mm_srli_epi32(res, 2); ++ tmp = _mm_and_si128(tmp, c33333333); ++ tmp1 = _mm_and_si128(res, c33333333); ++ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); ++ ++ tmp = _mm_srli_epi32(res, 4); ++ tmp = _mm_add_epi32(tmp, res); ++ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f); ++ ++ tmp = _mm_srli_epi32(res, 8); ++ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); ++ ++ tmp = _mm_srli_epi32(res, 16); ++ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); ++ ++ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; ++ ++ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; ++} ++ ++uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 ++#define vclzq_u8 vclzq_s8 ++ ++uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 ++#define vclzq_u16 vclzq_s16 ++ 
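++// (editor's note, scalar reference sketch; not part of the upstream header)
++// The vclzq_s32 implementation above uses the classic smear-then-popcount idiom:
++// OR x with x>>1, x>>2, x>>4, x>>8, x>>16 so every bit below the leading one is set,
++// count the set bits with a SWAR popcount, and return 32 - popcount(smeared).
++// Scalar equivalent for one lane (the helper name clz32_ref is chosen for illustration):
++//   static inline int clz32_ref(uint32_t x) {
++//       x |= x >> 1;  x |= x >> 2;  x |= x >> 4;  x |= x >> 8;  x |= x >> 16;
++//       x -= (x >> 1) & 0x55555555u;
++//       x = ((x >> 2) & 0x33333333u) + (x & 0x33333333u);
++//       x = ((x >> 4) + x) & 0x0f0f0f0fu;
++//       x += x >> 8;  x += x >> 16;
++//       return 32 - (int)(x & 0x3f);
++//   }
++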
++uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 ++#define vclzq_u32 vclzq_s32 ++ ++//************** Count leading sign bits ************************** ++//******************************************************************** ++//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following ++// the topmost bit, that are the same as the topmost bit, in each element in a vector ++//No corresponding vector intrinsics in IA32, need to implement it. ++//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits ++int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 ++_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vclsq_s8(_pM128i(a)); ++ return64(res); ++} ++ ++int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 ++_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = vclsq_s16(_pM128i(a)); ++ return64(res); ++} ++ ++int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 ++_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = vclsq_s32(_pM128i(a)); ++ return64(res); ++} ++ ++int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 ++_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) ++{ ++ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; ++ cff = _mm_cmpeq_epi8 (a,a); //0xff ++ c80 = _mm_set1_epi8(0x80); ++ c1 = _mm_set1_epi8(1); ++ a_mask = _mm_and_si128(a, c80); ++ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s8(a_comb); ++ return _mm_sub_epi8(a_comb, c1); ++} ++ ++int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 ++_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) ++{ ++ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; ++ cffff = _mm_cmpeq_epi16(a,a); ++ c8000 = _mm_slli_epi16(cffff, 15); //0x8000 ++ c1 = _mm_srli_epi16(cffff,15); //0x1 ++ a_mask = _mm_and_si128(a, c8000); ++ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cffff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s16(a_comb); ++ return _mm_sub_epi16(a_comb, c1); ++} ++ ++int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 ++_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) ++{ ++ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; ++ cffffffff = _mm_cmpeq_epi32(a,a); ++ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 ++ c1 = _mm_srli_epi32(cffffffff,31); //0x1 ++ a_mask = _mm_and_si128(a, c80000000); ++ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive ++ a_neg = _mm_xor_si128(a, cffffffff); ++ a_neg = _mm_and_si128(a_mask, a_neg); ++ a_pos = _mm_andnot_si128(a_mask, a); ++ a_comb = _mm_or_si128(a_pos, a_neg); ++ a_comb = vclzq_s32(a_comb); ++ return _mm_sub_epi32(a_comb, c1); ++} ++ ++//************************* Count number of set bits ******************************** ++//************************************************************************************* ++//No corresponding SIMD solution. 
One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element ++//another option is to do the following algorithm: ++ ++uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 ++_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a) ++{ ++ uint8x8_t res64; ++ __m128i res; ++ res = vcntq_u8(_pM128i(a)); ++ return64(res); ++} ++ ++int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 ++#define vcnt_s8 vcnt_u8 ++ ++poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 ++#define vcnt_p8 vcnt_u8 ++ ++uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 ++_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) ++{ ++ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, ++ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, ++ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, ++ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 }; ++ __m128i maskLOW, mask, lowpopcnt, hipopcnt; ++ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set ++ mask = _mm_and_si128(a, maskLOW); ++ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway ++ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits ++ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set ++ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway ++ return _mm_add_epi8(lowpopcnt, hipopcnt); ++} ++ ++int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 ++#define vcntq_s8 vcntq_u8 ++ ++poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 ++#define vcntq_p8 vcntq_u8 ++ ++//************************************************************************************** ++//*********************** Logical operations **************************************** ++//************************************************************************************** ++//************************** Bitwise not *********************************** ++//several Bitwise not implementations possible for SIMD. 
Eg "xor" with all ones, but the following one gives good performance ++int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vmvnq_s8(_pM128i(a)); ++ return64(res); ++} ++ ++int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a) ++{ ++ int16x4_t res64; ++ __m128i res; ++ res = vmvnq_s16(_pM128i(a)); ++ return64(res); ++} ++ ++int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 ++_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a) ++{ ++ int32x2_t res64; ++ __m128i res; ++ res = vmvnq_s32(_pM128i(a)); ++ return64(res); ++} ++ ++uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 ++#define vmvn_u8 vmvn_s8 ++ ++uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 ++#define vmvn_u16 vmvn_s16 ++ ++uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 ++#define vmvn_u32 vmvn_s32 ++ ++poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 ++#define vmvn_p8 vmvn_u8 ++ ++int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi8 (a,a); //0xff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi16 (a,a); //0xffff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 ++_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 ++{ ++ __m128i c1; ++ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff ++ return _mm_andnot_si128 (a, c1); ++} ++ ++uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 ++#define vmvnq_u8 vmvnq_s8 ++ ++uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 ++#define vmvnq_u16 vmvnq_s16 ++ ++uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 ++#define vmvnq_u32 vmvnq_s32 ++ ++poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 ++#define vmvnq_p8 vmvnq_u8 ++ ++//****************** Bitwise and *********************** ++//****************************************************** ++int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0]; ++ return res; ++} ++ ++uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 ++#define vand_u8 vand_s8 ++ ++uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 ++#define vand_u16 vand_s16 ++ ++uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 ++#define vand_u32 vand_s32 ++ ++uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 ++#define vand_u64 vand_s64 ++ ++ ++int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 ++#define vandq_s8 _mm_and_si128 ++ ++int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 ++#define vandq_s16 _mm_and_si128 ++ ++int32x4_t vandq_s32(int32x4_t 
a, int32x4_t b); // VAND q0,q0,q0 ++#define vandq_s32 _mm_and_si128 ++ ++int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 ++#define vandq_s64 _mm_and_si128 ++ ++uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 ++#define vandq_u8 _mm_and_si128 ++ ++uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 ++#define vandq_u16 _mm_and_si128 ++ ++uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 ++#define vandq_u32 _mm_and_si128 ++ ++uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 ++#define vandq_u64 _mm_and_si128 ++ ++//******************** Bitwise or ********************************* ++//****************************************************************** ++int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); ++} ++ ++ ++int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0]; ++ return res; ++} ++ ++uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 ++#define vorr_u8 vorr_s8 ++ ++uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 ++#define vorr_u16 vorr_s16 ++ ++uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 ++#define vorr_u32 vorr_s32 ++ ++uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 ++#define vorr_u64 vorr_s64 ++ ++int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 ++#define vorrq_s8 _mm_or_si128 ++ ++int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 ++#define vorrq_s16 _mm_or_si128 ++ ++int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 ++#define vorrq_s32 _mm_or_si128 ++ ++int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 ++#define vorrq_s64 _mm_or_si128 ++ ++uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 ++#define vorrq_u8 _mm_or_si128 ++ ++uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 ++#define vorrq_u16 _mm_or_si128 ++ ++uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 ++#define vorrq_u32 _mm_or_si128 ++ ++uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 ++#define vorrq_u64 _mm_or_si128 ++ ++//************* Bitwise exclusive or (EOR or XOR) ****************** ++//******************************************************************* ++int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); ++} ++ ++int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 ++#define veor_s16 veor_s8 ++ ++int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 ++#define veor_s32 veor_s8 ++ ++int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0]; ++ return res; 
++} ++ ++uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 ++#define veor_u8 veor_s8 ++ ++uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 ++#define veor_u16 veor_s16 ++ ++uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 ++#define veor_u32 veor_s32 ++ ++uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 ++#define veor_u64 veor_s64 ++ ++int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 ++#define veorq_s8 _mm_xor_si128 ++ ++int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 ++#define veorq_s16 _mm_xor_si128 ++ ++int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 ++#define veorq_s32 _mm_xor_si128 ++ ++int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 ++#define veorq_s64 _mm_xor_si128 ++ ++uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 ++#define veorq_u8 _mm_xor_si128 ++ ++uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 ++#define veorq_u16 _mm_xor_si128 ++ ++uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 ++#define veorq_u32 _mm_xor_si128 ++ ++uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 ++#define veorq_u64 _mm_xor_si128 ++ ++//********************** Bit Clear ********************************** ++//******************************************************************* ++//Logical AND complement (AND negation or AND NOT) ++int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" ++} ++ ++int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 ++#define vbic_s16 vbic_s8 ++ ++int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 ++#define vbic_s32 vbic_s8 ++ ++int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 ++#define vbic_u8 vbic_s8 ++ ++uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 ++#define vbic_u16 vbic_s16 ++ ++uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 ++#define vbic_u32 vbic_s32 ++ ++uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 ++#define vbic_u64 vbic_s64 ++ ++int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 ++#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 ++#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 ++#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 ++#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 ++#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 ++#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 ++#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 ++#define 
vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" ++ ++//**************** Bitwise OR complement ******************************** ++//**************************************** ******************************** ++//no exact IA 32 match, need to implement it as following ++int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b) ++{ ++ int8x8_t res64; ++ return64(vornq_s8(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b) ++{ ++ int16x4_t res64; ++ return64(vornq_s16(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2_t res64; ++ return64(vornq_s32(_pM128i(a), _pM128i(b))); ++} ++ ++ ++int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 ++#define vorn_u8 vorn_s8 ++ ++ ++uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 ++#define vorn_u16 vorn_s16 ++ ++uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 ++#define vorn_u32 vorn_s32 ++ ++uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 ++#define vorn_u64 vorn_s64 ++ ++ ++int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s8( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s16( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s32( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b) ++{ ++ __m128i c1, b1; ++ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff ++ b1 = _mm_andnot_si128 (b, c1); ++ return _mm_or_si128 (a, b1); ++} ++ ++uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_u8( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_s16( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++ ++uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 ++_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 ++{ ++ __m128i b1; ++ b1 = vmvnq_u32( b); //bitwise not for b ++ return _mm_or_si128 (a, b1); ++} ++uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 ++#define vornq_u64 vornq_s64 ++ ++//********************* Bitwise Select ***************************** 
++//****************************************************************** ++//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) ++ ++//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the ++//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. ++ ++//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination ++//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged ++ ++//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination ++//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. ++ ++//VBSL only is implemented for SIMD ++int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) ++{ ++ int8x8_t res64; ++ __m128i res; ++ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); ++ return64(res); ++} ++ ++int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_s16 vbsl_s8 ++ ++int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 ++#define vbsl_s32 vbsl_s8 ++ ++int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) ++{ ++ int64x1_t res; ++ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]); ++ return res; ++} ++ ++uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 ++#define vbsl_u8 vbsl_s8 ++ ++uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_u16 vbsl_s8 ++ ++uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 ++#define vbsl_u32 vbsl_s8 ++ ++uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 ++#define vbsl_u64 vbsl_s64 ++ ++float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 ++_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) ++{ ++ __m128 sel1, sel2; ++ __m64_128 res64; ++ sel1 = _mm_and_ps (_pM128(a), _pM128(b)); ++ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c)); ++ sel1 = _mm_or_ps (sel1, sel2); ++ _M64f(res64, sel1); ++ return res64; ++} ++ ++poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 ++#define vbsl_p8 vbsl_s8 ++ ++poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 ++#define vbsl_p16 vbsl_s8 ++ ++int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 ++_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 ++{ ++ __m128i sel1, sel2; ++ sel1 = _mm_and_si128 (a, b); ++ sel2 = _mm_andnot_si128 (a, c); ++ return _mm_or_si128 (sel1, sel2); ++} ++ ++int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_s16 vbslq_s8 ++ ++int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 ++#define vbslq_s32 vbslq_s8 ++ ++int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 ++#define vbslq_s64 vbslq_s8 ++ ++uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 ++#define vbslq_u8 vbslq_s8 ++ ++uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_u16 vbslq_s8 ++ ++uint32x4_t 
vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 ++#define vbslq_u32 vbslq_s8 ++ ++uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 ++#define vbslq_u64 vbslq_s8 ++ ++float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 ++_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 ++{ ++ __m128 sel1, sel2; ++ sel1 = _mm_and_ps (*(__m128*)&a, b); ++ sel2 = _mm_andnot_ps (*(__m128*)&a, c); ++ return _mm_or_ps (sel1, sel2); ++} ++ ++poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 ++#define vbslq_p8 vbslq_u8 ++ ++poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 ++#define vbslq_p16 vbslq_s8 ++ ++//************************************************************************************ ++//**************** Transposition operations **************************************** ++//************************************************************************************ ++//***************** Vector Transpose ************************************************ ++//************************************************************************************ ++//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. ++// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) ++int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp ++ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7) ++ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6), ++ return val; ++} ++ ++int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15}; ++ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3 ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2), ++ return val; ++} ++ ++int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 ++_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) ++{ ++ int32x2x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 ++ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0, ++ return val; ++} ++ ++uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 ++#define vtrn_u8 vtrn_s8 ++ ++uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 ++#define vtrn_u16 vtrn_s16 ++ ++uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 ++#define vtrn_u32 vtrn_s32 ++ ++float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 ++_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) ++{ ++ float32x2x2_t val; ++ 
val.val[0].m64_f32[0] = a.m64_f32[0]; ++ val.val[0].m64_f32[1] = b.m64_f32[0]; ++ val.val[1].m64_f32[0] = a.m64_f32[1]; ++ val.val[1].m64_f32[1] = b.m64_f32[1]; ++ return val; //a0,b0,a1,b1 ++} ++ ++poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 ++#define vtrn_p8 vtrn_u8 ++ ++poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 ++#define vtrn_p16 vtrn_s16 ++ ++//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 ++{ ++ int8x16x2_t r8x16; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 ++ ++ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) ++ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) ++ return r8x16; ++} ++ ++int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 ++{ ++ int16x8x2_t v16x8; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 ++ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 ++ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 ++ return v16x8; ++} ++ ++int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ int32x4x2_t v32x4; ++ __m128i a_sh, b_sh; ++ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 ++ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 ++ ++ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 ++ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 ++ return v32x4; ++} ++ ++uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 ++#define vtrnq_u8 vtrnq_s8 ++ ++uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 ++#define vtrnq_u16 vtrnq_s16 ++ ++uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 ++#define vtrnq_u32 vtrnq_s32 ++ ++float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ float32x4x2_t f32x4; ++ __m128 a_sh, b_sh; ++ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness ++ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness ++ ++ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 ++ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 ++ return f32x4; ++} ++ ++poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 ++#define vtrnq_p8 vtrnq_s8 ++ ++poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 ++#define vtrnq_p16 vtrnq_s16 ++ 
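For readers checking the transpose logic above, here is a minimal standalone sketch (plain C, assuming an SSE2-capable x86 toolchain; it is not part of NEON_2_SSE.h or of this patch) that reproduces the vtrnq_s32 pattern shown earlier in this section — shuffle each operand into even/odd order, then interleave — and prints the two result vectors:

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);        /* a0,a1,a2,a3 */
    __m128i b = _mm_setr_epi32(10, 11, 12, 13);    /* b0,b1,b2,b3 */
    /* 216 == _MM_SHUFFLE(3,1,2,0): reorder each operand to x0,x2,x1,x3 */
    __m128i a_sh = _mm_shuffle_epi32(a, 216);
    __m128i b_sh = _mm_shuffle_epi32(b, 216);
    __m128i lo = _mm_unpacklo_epi32(a_sh, b_sh);   /* a0,b0,a2,b2 */
    __m128i hi = _mm_unpackhi_epi32(a_sh, b_sh);   /* a1,b1,a3,b3 */
    int out[8];
    _mm_storeu_si128((__m128i *)&out[0], lo);
    _mm_storeu_si128((__m128i *)&out[4], hi);
    printf("val[0] = %d %d %d %d\n", out[0], out[1], out[2], out[3]);
    printf("val[1] = %d %d %d %d\n", out[4], out[5], out[6], out[7]);
    /* expected: val[0] = 0 10 2 12 and val[1] = 1 11 3 13 */
    return 0;
}

The 8- and 16-bit q-register variants above follow the same shuffle-then-unpack idea, only using _mm_shuffle_epi8 with the even/odd byte masks instead of _mm_shuffle_epi32.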
++//***************** Interleave elements *************************** ++//***************************************************************** ++//output has (a0,b0,a1,b1, a2,b2,.....) ++int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); ++ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 ++#define vzip_s32 vtrn_s32 ++ ++uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 ++#define vzip_u8 vzip_s8 ++ ++uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 ++#define vzip_u16 vzip_s16 ++ ++uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 ++#define vzip_u32 vzip_s32 ++ ++float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 ++#define vzip_f32 vtrn_f32 ++ ++poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 ++#define vzip_p8 vzip_u8 ++ ++poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 ++#define vzip_p16 vzip_u16 ++ ++int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 ++{ ++ int8x16x2_t r8x16; ++ r8x16.val[0] = _mm_unpacklo_epi8(a, b); ++ r8x16.val[1] = _mm_unpackhi_epi8(a, b); ++ return r8x16; ++} ++ ++int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 ++{ ++ int16x8x2_t r16x8; ++ r16x8.val[0] = _mm_unpacklo_epi16(a, b); ++ r16x8.val[1] = _mm_unpackhi_epi16(a, b); ++ return r16x8; ++} ++ ++int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 ++{ ++ int32x4x2_t r32x4; ++ r32x4.val[0] = _mm_unpacklo_epi32(a, b); ++ r32x4.val[1] = _mm_unpackhi_epi32(a, b); ++ return r32x4; ++} ++ ++uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 ++#define vzipq_u8 vzipq_s8 ++ ++uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 ++#define vzipq_u16 vzipq_s16 ++ ++uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 ++#define vzipq_u32 vzipq_s32 ++ ++float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 ++{ ++ float32x4x2_t f32x4; ++ f32x4.val[0] = _mm_unpacklo_ps ( a, b); ++ f32x4.val[1] = _mm_unpackhi_ps ( a, b); ++ return f32x4; ++} ++ ++poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 ++#define vzipq_p8 vzipq_u8 ++ ++poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 ++#define vzipq_p16 vzipq_u16 ++ ++//*********************** De-Interleave elements ************************* ++//************************************************************************* ++//As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...) 
++//no such functions in IA32 SIMD, shuffle is required ++int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 ++_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0 ++{ ++ int8x8x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15}; ++ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7) ++ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 ++_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0 ++{ ++ int16x4x2_t val; ++ __m128i tmp, val0; ++ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; ++ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 ++ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3 ++ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 ++_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0 ++{ ++ int32x2x2_t val; ++ __m128i val0; ++ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 ++ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); ++ return val; ++} ++ ++uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 ++#define vuzp_u8 vuzp_s8 ++ ++uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 ++#define vuzp_u16 vuzp_s16 ++ ++uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 ++#define vuzp_u32 vuzp_s32 ++ ++float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 ++#define vuzp_f32 vzip_f32 ++ ++poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 ++#define vuzp_p8 vuzp_u8 ++ ++poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 ++#define vuzp_p16 vuzp_u16 ++ ++int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 ++_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 ++{ ++ int8x16x2_t v8x16; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 ++ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b ++ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, ++ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 ++ return v8x16; ++} ++ ++int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 ++_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 ++{ ++ int16x8x2_t v16x8; ++ __m128i a_sh, b_sh; ++ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; ++ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 ++ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, 
b4, b6, b1, b3, b5, b7 ++ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 ++ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 ++ return v16x8; ++} ++ ++int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 ++_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 ++{ ++ //may be not optimal solution compared with serial ++ int32x4x2_t v32x4; ++ __m128i a_sh, b_sh; ++ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 ++ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 ++ ++ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 ++ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 ++ return v32x4; ++} ++ ++uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 ++#define vuzpq_u8 vuzpq_s8 ++ ++uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 ++#define vuzpq_u16 vuzpq_s16 ++ ++uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 ++#define vuzpq_u32 vuzpq_s32 ++ ++float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 ++_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 ++{ ++ float32x4x2_t v32x4; ++ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however ++ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however ++ return v32x4; ++} ++ ++poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 ++#define vuzpq_p8 vuzpq_u8 ++ ++poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 ++#define vuzpq_p16 vuzpq_u16 ++ ++//############################################################################################## ++//*********************** Reinterpret cast intrinsics.****************************************** ++//############################################################################################## ++// Not a part of oficial NEON instruction set but available in gcc compiler ********************* ++poly8x8_t vreinterpret_p8_u32 (uint32x2_t t); ++#define vreinterpret_p8_u32 ++ ++poly8x8_t vreinterpret_p8_u16 (uint16x4_t t); ++#define vreinterpret_p8_u16 ++ ++poly8x8_t vreinterpret_p8_u8 (uint8x8_t t); ++#define vreinterpret_p8_u8 ++ ++poly8x8_t vreinterpret_p8_s32 (int32x2_t t); ++#define vreinterpret_p8_s32 ++ ++poly8x8_t vreinterpret_p8_s16 (int16x4_t t); ++#define vreinterpret_p8_s16 ++ ++poly8x8_t vreinterpret_p8_s8 (int8x8_t t); ++#define vreinterpret_p8_s8 ++ ++poly8x8_t vreinterpret_p8_u64 (uint64x1_t t); ++#define vreinterpret_p8_u64 ++ ++poly8x8_t vreinterpret_p8_s64 (int64x1_t t); ++#define vreinterpret_p8_s64 ++ ++poly8x8_t vreinterpret_p8_f32 (float32x2_t t); ++#define vreinterpret_p8_f32 ++ ++poly8x8_t vreinterpret_p8_p16 (poly16x4_t t); ++#define vreinterpret_p8_p16 ++ ++poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); ++#define vreinterpretq_p8_u32 ++ ++poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); ++#define vreinterpretq_p8_u16 ++ ++poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); ++#define vreinterpretq_p8_u8 ++ ++poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); ++#define vreinterpretq_p8_s32 ++ ++poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); ++#define vreinterpretq_p8_s16 ++ ++poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); ++#define vreinterpretq_p8_s8 ++ ++poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); ++#define vreinterpretq_p8_u64 ++ ++poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); ++#define vreinterpretq_p8_s64 
++ ++poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); ++#define vreinterpretq_p8_f32(t) _M128i(t) ++ ++poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); ++#define vreinterpretq_p8_p16 ++ ++poly16x4_t vreinterpret_p16_u32 (uint32x2_t t); ++#define vreinterpret_p16_u32 ++ ++poly16x4_t vreinterpret_p16_u16 (uint16x4_t t); ++#define vreinterpret_p16_u16 ++ ++poly16x4_t vreinterpret_p16_u8 (uint8x8_t t); ++#define vreinterpret_p16_u8 ++ ++poly16x4_t vreinterpret_p16_s32 (int32x2_t t); ++#define vreinterpret_p16_s32 ++ ++poly16x4_t vreinterpret_p16_s16 (int16x4_t t); ++#define vreinterpret_p16_s16 ++ ++poly16x4_t vreinterpret_p16_s8 (int8x8_t t); ++#define vreinterpret_p16_s8 ++ ++poly16x4_t vreinterpret_p16_u64 (uint64x1_t t); ++#define vreinterpret_p16_u64 ++ ++poly16x4_t vreinterpret_p16_s64 (int64x1_t t); ++#define vreinterpret_p16_s64 ++ ++poly16x4_t vreinterpret_p16_f32 (float32x2_t t); ++#define vreinterpret_p16_f32 ++ ++poly16x4_t vreinterpret_p16_p8 (poly8x8_t t); ++#define vreinterpret_p16_p8 ++ ++poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); ++#define vreinterpretq_p16_u32 ++ ++poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); ++#define vreinterpretq_p16_u16 ++ ++poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); ++#define vreinterpretq_p16_s32 ++ ++poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); ++#define vreinterpretq_p16_s16 ++ ++poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); ++#define vreinterpretq_p16_s8 ++ ++poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); ++#define vreinterpretq_p16_u64 ++ ++poly16x8_t vreinterpretq_p16_s64 (int64x2_t t); ++#define vreinterpretq_p16_s64 ++ ++poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); ++#define vreinterpretq_p16_f32(t) _M128i(t) ++ ++poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); ++#define vreinterpretq_p16_p8 vreinterpretq_s16_p8 ++ ++//**** Integer to float ****** ++float32x2_t vreinterpret_f32_u32 (uint32x2_t t); ++#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t)) ++ ++ ++float32x2_t vreinterpret_f32_u16 (uint16x4_t t); ++#define vreinterpret_f32_u16 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_u8 (uint8x8_t t); ++#define vreinterpret_f32_u8 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s32 (int32x2_t t); ++#define vreinterpret_f32_s32 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s16 (int16x4_t t); ++#define vreinterpret_f32_s16 vreinterpret_f32_u32 ++ ++float32x2_t vreinterpret_f32_s8 (int8x8_t t); ++#define vreinterpret_f32_s8 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_u64(uint64x1_t t); ++#define vreinterpret_f32_u64 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_s64 (int64x1_t t); ++#define vreinterpret_f32_s64 vreinterpret_f32_u32 ++ ++ ++float32x2_t vreinterpret_f32_p16 (poly16x4_t t); ++#define vreinterpret_f32_p16 vreinterpret_f32_u32 ++ ++float32x2_t vreinterpret_f32_p8 (poly8x8_t t); ++#define vreinterpret_f32_p8 vreinterpret_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); ++#define vreinterpretq_f32_u32(t) *(__m128*)&(t) ++ ++float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); ++#define vreinterpretq_f32_u16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); ++#define vreinterpretq_f32_u8 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s32 (int32x4_t t); ++#define vreinterpretq_f32_s32 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s16 (int16x8_t t); ++#define vreinterpretq_f32_s16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s8 (int8x16_t t); ++#define vreinterpretq_f32_s8 
vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); ++#define vreinterpretq_f32_u64 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_s64 (int64x2_t t); ++#define vreinterpretq_f32_s64 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); ++#define vreinterpretq_f32_p16 vreinterpretq_f32_u32 ++ ++float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); ++#define vreinterpretq_f32_p8 vreinterpretq_f32_u32 ++ ++//*** Integer type conversions ****************** ++//no conversion necessary for the following functions because it is same data type ++int64x1_t vreinterpret_s64_u32 (uint32x2_t t); ++#define vreinterpret_s64_u32 ++ ++int64x1_t vreinterpret_s64_u16 (uint16x4_t t); ++#define vreinterpret_s64_u16 ++ ++int64x1_t vreinterpret_s64_u8 (uint8x8_t t); ++#define vreinterpret_s64_u8 ++ ++int64x1_t vreinterpret_s64_s32 (int32x2_t t); ++#define vreinterpret_s64_s32 ++ ++int64x1_t vreinterpret_s64_s16 (int16x4_t t); ++#define vreinterpret_s64_s16 ++ ++int64x1_t vreinterpret_s64_s8 (int8x8_t t); ++#define vreinterpret_s64_s8 ++ ++int64x1_t vreinterpret_s64_u64 (uint64x1_t t); ++#define vreinterpret_s64_u64 ++ ++int64x1_t vreinterpret_s64_f32 (float32x2_t t); ++#define vreinterpret_s64_f32 ++ ++int64x1_t vreinterpret_s64_p16 (poly16x4_t t); ++#define vreinterpret_s64_p16 ++ ++int64x1_t vreinterpret_s64_p8 (poly8x8_t t); ++#define vreinterpret_s64_p8 ++ ++int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); ++#define vreinterpretq_s64_u32 ++ ++int64x2_t vreinterpretq_s64_s16 (uint16x8_t t); ++#define vreinterpretq_s64_s16 ++ ++int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); ++#define vreinterpretq_s64_u8 ++ ++int64x2_t vreinterpretq_s64_s32 (int32x4_t t); ++#define vreinterpretq_s64_s32 ++ ++int64x2_t vreinterpretq_s64_u16 (int16x8_t t); ++#define vreinterpretq_s64_u16 ++ ++int64x2_t vreinterpretq_s64_s8 (int8x16_t t); ++#define vreinterpretq_s64_s8 ++ ++int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); ++#define vreinterpretq_s64_u64 ++ ++int64x2_t vreinterpretq_s64_f32 (float32x4_t t); ++#define vreinterpretq_s64_f32(t) _M128i(t) ++ ++int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); ++#define vreinterpretq_s64_p16 ++ ++int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); ++#define vreinterpretq_s64_p8 ++ ++uint64x1_t vreinterpret_u64_u32 (uint32x2_t t); ++#define vreinterpret_u64_u32 ++ ++uint64x1_t vreinterpret_u64_u16 (uint16x4_t t); ++#define vreinterpret_u64_u16 ++ ++uint64x1_t vreinterpret_u64_u8 (uint8x8_t t); ++#define vreinterpret_u64_u8 ++ ++uint64x1_t vreinterpret_u64_s32 (int32x2_t t); ++#define vreinterpret_u64_s32 ++ ++uint64x1_t vreinterpret_u64_s16 (int16x4_t t); ++#define vreinterpret_u64_s16 ++ ++uint64x1_t vreinterpret_u64_s8 (int8x8_t t); ++#define vreinterpret_u64_s8 ++ ++uint64x1_t vreinterpret_u64_s64 (int64x1_t t); ++#define vreinterpret_u64_s64 ++ ++uint64x1_t vreinterpret_u64_f32 (float32x2_t t); ++#define vreinterpret_u64_f32 ++ ++uint64x1_t vreinterpret_u64_p16 (poly16x4_t t); ++#define vreinterpret_u64_p16 ++ ++uint64x1_t vreinterpret_u64_p8 (poly8x8_t t); ++#define vreinterpret_u64_p8 ++ ++uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); ++#define vreinterpretq_u64_u32 ++ ++uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); ++#define vreinterpretq_u64_u16 ++ ++uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); ++#define vreinterpretq_u64_u8 ++ ++uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); ++#define vreinterpretq_u64_s32 ++ ++uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); ++#define vreinterpretq_u64_s16 ++ ++uint64x2_t 
vreinterpretq_u64_s8 (int8x16_t t); ++#define vreinterpretq_u64_s8 ++ ++uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); ++#define vreinterpretq_u64_s64 ++ ++uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); ++#define vreinterpretq_u64_f32(t) _M128i(t) ++ ++uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); ++#define vreinterpretq_u64_p16 ++ ++uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); ++#define vreinterpretq_u64_p8 ++ ++int8x8_t vreinterpret_s8_u32 (uint32x2_t t); ++#define vreinterpret_s8_u32 ++ ++int8x8_t vreinterpret_s8_u16 (uint16x4_t t); ++#define vreinterpret_s8_u16 ++ ++int8x8_t vreinterpret_s8_u8 (uint8x8_t t); ++#define vreinterpret_s8_u8 ++ ++int8x8_t vreinterpret_s8_s32 (int32x2_t t); ++#define vreinterpret_s8_s32 ++ ++int8x8_t vreinterpret_s8_s16 (int16x4_t t); ++#define vreinterpret_s8_s16 ++ ++int8x8_t vreinterpret_s8_u64 (uint64x1_t t); ++#define vreinterpret_s8_u64 ++ ++int8x8_t vreinterpret_s8_s64 (int64x1_t t); ++#define vreinterpret_s8_s64 ++ ++int8x8_t vreinterpret_s8_f32 (float32x2_t t); ++#define vreinterpret_s8_f32 ++ ++int8x8_t vreinterpret_s8_p16 (poly16x4_t t); ++#define vreinterpret_s8_p16 ++ ++int8x8_t vreinterpret_s8_p8 (poly8x8_t t); ++#define vreinterpret_s8_p8 ++ ++int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); ++#define vreinterpretq_s8_u32 ++ ++int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); ++#define vreinterpretq_s8_u16 ++ ++int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); ++#define vreinterpretq_s8_u8 ++ ++int8x16_t vreinterpretq_s8_s32 (int32x4_t t); ++#define vreinterpretq_s8_s32 ++ ++int8x16_t vreinterpretq_s8_s16 (int16x8_t t); ++#define vreinterpretq_s8_s16 ++ ++int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); ++#define vreinterpretq_s8_u64 ++ ++int8x16_t vreinterpretq_s8_s64 (int64x2_t t); ++#define vreinterpretq_s8_s64 ++ ++int8x16_t vreinterpretq_s8_f32 (float32x4_t t); ++#define vreinterpretq_s8_f32(t) _M128i(t) ++ ++int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); ++#define vreinterpretq_s8_p16 ++ ++int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); ++#define vreinterpretq_s8_p8 ++ ++int16x4_t vreinterpret_s16_u32 (uint32x2_t t); ++#define vreinterpret_s16_u32 ++ ++int16x4_t vreinterpret_s16_u16 (uint16x4_t t); ++#define vreinterpret_s16_u16 ++ ++int16x4_t vreinterpret_s16_u8 (uint8x8_t t); ++#define vreinterpret_s16_u8 ++ ++int16x4_t vreinterpret_s16_s32 (int32x2_t t); ++#define vreinterpret_s16_s32 ++ ++int16x4_t vreinterpret_s16_s8 (int8x8_t t); ++#define vreinterpret_s16_s8 ++ ++int16x4_t vreinterpret_s16_u64 (uint64x1_t t); ++#define vreinterpret_s16_u64 ++ ++int16x4_t vreinterpret_s16_s64 (int64x1_t t); ++#define vreinterpret_s16_s64 ++ ++int16x4_t vreinterpret_s16_f32 (float32x2_t t); ++#define vreinterpret_s16_f32 ++ ++ ++int16x4_t vreinterpret_s16_p16 (poly16x4_t t); ++#define vreinterpret_s16_p16 ++ ++int16x4_t vreinterpret_s16_p8 (poly8x8_t t); ++#define vreinterpret_s16_p8 ++ ++int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); ++#define vreinterpretq_s16_u32 ++ ++int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); ++#define vreinterpretq_s16_u16 ++ ++int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); ++#define vreinterpretq_s16_u8 ++ ++int16x8_t vreinterpretq_s16_s32 (int32x4_t t); ++#define vreinterpretq_s16_s32 ++ ++int16x8_t vreinterpretq_s16_s8 (int8x16_t t); ++#define vreinterpretq_s16_s8 ++ ++int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); ++#define vreinterpretq_s16_u64 ++ ++int16x8_t vreinterpretq_s16_s64 (int64x2_t t); ++#define vreinterpretq_s16_s64 ++ ++int16x8_t vreinterpretq_s16_f32 (float32x4_t t); ++#define vreinterpretq_s16_f32(t) 
_M128i(t) ++ ++int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); ++#define vreinterpretq_s16_p16 ++ ++int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); ++#define vreinterpretq_s16_p8 ++ ++int32x2_t vreinterpret_s32_u32 (uint32x2_t t); ++#define vreinterpret_s32_u32 ++ ++int32x2_t vreinterpret_s32_u16 (uint16x4_t t); ++#define vreinterpret_s32_u16 ++ ++int32x2_t vreinterpret_s32_u8 (uint8x8_t t); ++#define vreinterpret_s32_u8 ++ ++int32x2_t vreinterpret_s32_s16 (int16x4_t t); ++#define vreinterpret_s32_s16 ++ ++int32x2_t vreinterpret_s32_s8 (int8x8_t t); ++#define vreinterpret_s32_s8 ++ ++int32x2_t vreinterpret_s32_u64 (uint64x1_t t); ++#define vreinterpret_s32_u64 ++ ++int32x2_t vreinterpret_s32_s64 (int64x1_t t); ++#define vreinterpret_s32_s64 ++ ++int32x2_t vreinterpret_s32_f32 (float32x2_t t); ++#define vreinterpret_s32_f32 ++ ++int32x2_t vreinterpret_s32_p16 (poly16x4_t t); ++#define vreinterpret_s32_p16 ++ ++int32x2_t vreinterpret_s32_p8 (poly8x8_t t); ++#define vreinterpret_s32_p8 ++ ++int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); ++#define vreinterpretq_s32_u32 ++ ++int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); ++#define vreinterpretq_s32_u16 ++ ++int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); ++#define vreinterpretq_s32_u8 ++ ++int32x4_t vreinterpretq_s32_s16 (int16x8_t t); ++#define vreinterpretq_s32_s16 ++ ++int32x4_t vreinterpretq_s32_s8 (int8x16_t t); ++#define vreinterpretq_s32_s8 ++ ++int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); ++#define vreinterpretq_s32_u64 ++ ++int32x4_t vreinterpretq_s32_s64 (int64x2_t t); ++#define vreinterpretq_s32_s64 ++ ++int32x4_t vreinterpretq_s32_f32 (float32x4_t t); ++#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t)) ++ ++int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); ++#define vreinterpretq_s32_p16 ++ ++int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); ++#define vreinterpretq_s32_p8 ++ ++uint8x8_t vreinterpret_u8_u32 (uint32x2_t t); ++#define vreinterpret_u8_u32 ++ ++uint8x8_t vreinterpret_u8_u16 (uint16x4_t t); ++#define vreinterpret_u8_u16 ++ ++uint8x8_t vreinterpret_u8_s32 (int32x2_t t); ++#define vreinterpret_u8_s32 ++ ++uint8x8_t vreinterpret_u8_s16 (int16x4_t t); ++#define vreinterpret_u8_s16 ++ ++uint8x8_t vreinterpret_u8_s8 (int8x8_t t); ++#define vreinterpret_u8_s8 ++ ++uint8x8_t vreinterpret_u8_u64 (uint64x1_t t); ++#define vreinterpret_u8_u64 ++ ++uint8x8_t vreinterpret_u8_s64 (int64x1_t t); ++#define vreinterpret_u8_s64 ++ ++uint8x8_t vreinterpret_u8_f32 (float32x2_t t); ++#define vreinterpret_u8_f32 ++ ++uint8x8_t vreinterpret_u8_p16 (poly16x4_t t); ++#define vreinterpret_u8_p16 ++ ++uint8x8_t vreinterpret_u8_p8 (poly8x8_t t); ++#define vreinterpret_u8_p8 ++ ++uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); ++#define vreinterpretq_u8_u32 ++ ++uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); ++#define vreinterpretq_u8_u16 ++ ++uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); ++#define vreinterpretq_u8_s32 ++ ++uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); ++#define vreinterpretq_u8_s16 ++ ++uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); ++#define vreinterpretq_u8_s8 ++ ++uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); ++#define vreinterpretq_u8_u64 ++ ++uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); ++#define vreinterpretq_u8_s64 ++ ++uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); ++#define vreinterpretq_u8_f32(t) _M128i(t) ++ ++ ++uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); ++#define vreinterpretq_u8_p16 ++ ++uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); ++#define vreinterpretq_u8_p8 ++ ++uint16x4_t 
vreinterpret_u16_u32 (uint32x2_t t); ++#define vreinterpret_u16_u32 ++ ++uint16x4_t vreinterpret_u16_u8 (uint8x8_t t); ++#define vreinterpret_u16_u8 ++ ++uint16x4_t vreinterpret_u16_s32 (int32x2_t t); ++#define vreinterpret_u16_s32 ++ ++uint16x4_t vreinterpret_u16_s16 (int16x4_t t); ++#define vreinterpret_u16_s16 ++ ++uint16x4_t vreinterpret_u16_s8 (int8x8_t t); ++#define vreinterpret_u16_s8 ++ ++uint16x4_t vreinterpret_u16_u64 (uint64x1_t t); ++#define vreinterpret_u16_u64 ++ ++uint16x4_t vreinterpret_u16_s64 (int64x1_t t); ++#define vreinterpret_u16_s64 ++ ++uint16x4_t vreinterpret_u16_f32 (float32x2_t t); ++#define vreinterpret_u16_f32 ++ ++uint16x4_t vreinterpret_u16_p16 (poly16x4_t t); ++#define vreinterpret_u16_p16 ++ ++uint16x4_t vreinterpret_u16_p8 (poly8x8_t t); ++#define vreinterpret_u16_p8 ++ ++uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); ++#define vreinterpretq_u16_u32 ++ ++uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); ++#define vreinterpretq_u16_u8 ++ ++uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); ++#define vreinterpretq_u16_s32 ++ ++uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); ++#define vreinterpretq_u16_s16 ++ ++uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); ++#define vreinterpretq_u16_s8 ++ ++uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); ++#define vreinterpretq_u16_u64 ++ ++uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); ++#define vreinterpretq_u16_s64 ++ ++uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); ++#define vreinterpretq_u16_f32(t) _M128i(t) ++ ++uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); ++#define vreinterpretq_u16_p16 ++ ++uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); ++#define vreinterpretq_u16_p8 ++ ++uint32x2_t vreinterpret_u32_u16 (uint16x4_t t); ++#define vreinterpret_u32_u16 ++ ++uint32x2_t vreinterpret_u32_u8 (uint8x8_t t); ++#define vreinterpret_u32_u8 ++ ++uint32x2_t vreinterpret_u32_s32 (int32x2_t t); ++#define vreinterpret_u32_s32 ++ ++uint32x2_t vreinterpret_u32_s16 (int16x4_t t); ++#define vreinterpret_u32_s16 ++ ++uint32x2_t vreinterpret_u32_s8 (int8x8_t t); ++#define vreinterpret_u32_s8 ++ ++uint32x2_t vreinterpret_u32_u64 (uint64x1_t t); ++#define vreinterpret_u32_u64 ++ ++uint32x2_t vreinterpret_u32_s64 (int64x1_t t); ++#define vreinterpret_u32_s64 ++ ++uint32x2_t vreinterpret_u32_f32 (float32x2_t t); ++#define vreinterpret_u32_f32 ++ ++uint32x2_t vreinterpret_u32_p16 (poly16x4_t t); ++#define vreinterpret_u32_p16 ++ ++uint32x2_t vreinterpret_u32_p8 (poly8x8_t t); ++#define vreinterpret_u32_p8 ++ ++uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); ++#define vreinterpretq_u32_u16 ++ ++uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); ++#define vreinterpretq_u32_u8 ++ ++uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); ++#define vreinterpretq_u32_s32 ++ ++uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); ++#define vreinterpretq_u32_s16 ++ ++uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); ++#define vreinterpretq_u32_s8 ++ ++uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); ++#define vreinterpretq_u32_u64 ++ ++uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); ++#define vreinterpretq_u32_s64 ++ ++uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); ++#define vreinterpretq_u32_f32(t) _M128i(t) ++ ++uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); ++#define vreinterpretq_u32_p16 ++ ++uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); ++#define vreinterpretq_u32_p8 ++ ++#endif /* NEON2SSE_H */ +diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h +index fee33a3..22fb2ce 100644 +--- a/gcc/config/i386/gnu-user.h ++++ 
b/gcc/config/i386/gnu-user.h +@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see + When the -shared link option is used a final link is not being + done. */ + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ " -mssse3 -fno-short-enums " \ ++ + #undef ASM_SPEC + #define ASM_SPEC \ +- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" ++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + #undef SUBTARGET_EXTRA_SPECS + #define SUBTARGET_EXTRA_SPECS \ +diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h +index 7a02a7e..cac4179 100644 +--- a/gcc/config/i386/gnu-user64.h ++++ b/gcc/config/i386/gnu-user64.h +@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define SPEC_X32 "mx32" + #endif + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ "%{m32:-mssse3 -fno-short-enums}" \ ++ "%{!m32:-msse4.2 -mpopcnt}" ++ + #undef ASM_SPEC + #define ASM_SPEC "%{" SPEC_32 ":--32} \ + %{" SPEC_64 ":--64} \ +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 3d044e8..5c89fca 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC ++ && !TARGET_HAS_BIONIC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) +diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h +index 4b9910f..c33e074 100644 +--- a/gcc/config/i386/linux-common.h ++++ b/gcc/config/i386/linux-common.h +@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. 
If not see + #undef CC1_SPEC + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC \ ++ ANDROID_TARGET_CC1_SPEC \ ++ " " \ ++ ANDROID_CC1_SPEC("-fPIC")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + + #undef LINK_SPEC + #define LINK_SPEC \ +diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h +index a1f98d3..3725799 100644 +--- a/gcc/config/i386/pmm_malloc.h ++++ b/gcc/config/i386/pmm_malloc.h +@@ -31,7 +31,7 @@ + #ifndef __cplusplus + extern int posix_memalign (void **, size_t, size_t); + #else +-extern "C" int posix_memalign (void **, size_t, size_t) throw (); ++extern "C" int posix_memalign (void **, size_t, size_t); + #endif + + static __inline void * +diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h +index 301a41c..bebfe7f 100644 +--- a/gcc/config/linux-android.h ++++ b/gcc/config/linux-android.h +@@ -38,15 +38,18 @@ + "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" + + #define ANDROID_LINK_SPEC \ +- "%{shared: -Bsymbolic}" ++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" + +-#define ANDROID_CC1_SPEC \ ++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ + "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ +- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" ++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" + + #define ANDROID_CC1PLUS_SPEC \ +- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ +- "%{!frtti:%{!fno-rtti: -fno-rtti}}" ++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ ++ "%{!frtti:%{!fno-rtti: -frtti}}" ++ ++#define ANDROID_ASM_SPEC \ ++ "--noexecstack" + + #define ANDROID_LIB_SPEC \ + "%{!static: -ldl}" +diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h +new file mode 100644 +index 0000000..32c539c +--- /dev/null ++++ b/gcc/config/mips/android.h +@@ -0,0 +1,49 @@ ++/* Target macros for mips*-*android* targets. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#undef DRIVER_SELF_SPECS ++#define DRIVER_SELF_SPECS \ ++ /* Make sure a -mips option is present. This helps us to pick \ ++ the right multilib, and also makes the later specs easier \ ++ to write. */ \ ++ MIPS_ISA_LEVEL_SPEC, \ ++ \ ++ /* Infer the default float setting from -march. */ \ ++ MIPS_ARCH_FLOAT_SPEC, \ ++ \ ++ /* Infer the -msynci setting from -march if not explicitly set. */ \ ++ MIPS_ISA_SYNCI_SPEC, \ ++ \ ++ /* If no ABI option is specified, infer one from the ISA level \ ++ or -mgp setting. */ \ ++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ ++ \ ++ /* If no FP ABI option is specified, infer one from the \ ++ ABI/ISA level unless there is a conflicting option. 
*/ \ ++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ ++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ ++ \ ++ /* If no odd-spreg option is specified, infer one from the ISA. */ \ ++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ ++ \ ++ /* Base SPECs. */ \ ++ BASE_DRIVER_SELF_SPECS, \ ++ \ ++ /* Use the standard linux specs for everything else. */ \ ++ LINUX_DRIVER_SELF_SPECS +diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h +index 15b549c..4a28160 100644 +--- a/gcc/config/mips/gnu-user.h ++++ b/gcc/config/mips/gnu-user.h +@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } while (0) + + #undef SUBTARGET_CPP_SPEC +@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see + + #undef SUBTARGET_ASM_SPEC + #define SUBTARGET_ASM_SPEC \ +- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" ++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + /* The MIPS assembler has different syntax for .set. We set it to + .dummy to trap any errors. */ +@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + #endif + + #define LINUX_DRIVER_SELF_SPECS \ +- NO_SHARED_SPECS \ ++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ + MARCH_MTUNE_NATIVE_SPECS, \ + /* -mplt has no effect without -mno-shared. Simplify later \ + specs handling by removing a redundant option. */ \ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8429a7c..4b0825d 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see + #undef SUBTARGET_CC1_SPEC + #define SUBTARGET_CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) + + #undef CC1PLUS_SPEC + #define CC1PLUS_SPEC \ +@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see + + /* The default value isn't sufficient in 64-bit mode. */ + #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif +diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android +new file mode 100644 +index 0000000..39f512c +--- /dev/null ++++ b/gcc/config/mips/t-linux-android +@@ -0,0 +1,3 @@ ++MULTILIB_OPTIONS = mips32r2/mips32r6 ++MULTILIB_DIRNAMES = mips-r2 mips-r6 ++MULTILIB_OSDIRNAMES = ../libr2 ../libr6 +diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 +new file mode 100644 +index 0000000..55cab7d +--- /dev/null ++++ b/gcc/config/mips/t-linux-android64 +@@ -0,0 +1,4 @@ ++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 ++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 ++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 ++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 +diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h +index 37ecfc4..a5f1b99 100644 +--- a/gcc/config/openbsd.h ++++ b/gcc/config/openbsd.h +@@ -136,8 +136,12 @@ while (0) + #define LIB_SPEC OBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LIB_SPEC + #define LIB_SPEC OBSD_LIB_SPEC +diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h +index cbf9097..eb2217f 100644 +--- a/gcc/config/rs6000/sysv4.h ++++ b/gcc/config/rs6000/sysv4.h +@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) + -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + #if defined(HAVE_LD_EH_FRAME_HDR) +-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# ifdef USE_EH_FRAME_HDR_FOR_STATIC ++# define LINK_EH_SPEC "--eh-frame-hdr " ++# else ++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# endif + #endif + + #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ +diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h +index 5160e1f..7632a50 100644 +--- a/gcc/config/sol2.h ++++ b/gcc/config/sol2.h +@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see + /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs + --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++#endif + #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ + #endif + +diff --git a/gcc/configure b/gcc/configure +index 1c6e340..28ad050 100755 +--- a/gcc/configure ++++ b/gcc/configure +@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 + enable_fix_cortex_a53_843419 + with_glibc_version + enable_gnu_unique_object ++enable_eh_frame_hdr_for_static + enable_linker_build_id + enable_default_ssp + with_long_double_128 +@@ -1670,6 +1671,9 @@ Optional Features: + --enable-gnu-unique-object + enable the use of the @gnu_unique_object ELF + extension on glibc systems ++ --enable-eh-frame-hdr-for-static ++ enable linker PT_GNU_EH_FRAME support for static ++ executable + --enable-linker-build-id + compiler will always pass --build-id to linker + --enable-default-ssp enable Stack Smashing Protection as default +@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + + $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h + ++ # Check whether --enable-eh-frame-hdr-for-static was given. ++if test "${enable_eh_frame_hdr_for_static+set}" = set; then : ++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; ++ esac ++else ++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. ++ if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi ++fi ++ ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ ++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h ++ ++ fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 + $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } +diff --git a/gcc/configure.ac b/gcc/configure.ac +index 6c1dcd9..0cf7419 100644 +--- a/gcc/configure.ac ++++ b/gcc/configure.ac +@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) + if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, + [Define if your linker supports .eh_frame_hdr.]) ++ AC_ARG_ENABLE(eh-frame-hdr-for-static, ++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], ++ [enable linker PT_GNU_EH_FRAME support for static executable])], ++ [case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'.]) ;; ++ esac], ++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. 
++ [[if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi]]) ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, ++[Define if your system supports PT_GNU_EH_FRAME for static executable.]) ++ fi + fi + AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) + +diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C +new file mode 100644 +index 0000000..15408ef +--- /dev/null ++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C +@@ -0,0 +1,25 @@ ++// PR c++/4381 ++// Test that exception-specs work properly for classes with virtual bases. ++ ++// { dg-do run } ++// { dg-options "-static" } ++ ++class Base {}; ++ ++struct A : virtual public Base ++{ ++ A() {} ++}; ++ ++struct B {}; ++ ++void func() throw (B,A) ++{ ++ throw A(); ++} ++ ++int main(void) ++{ ++ try { func(); } ++ catch (A& a) { } ++} +diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c +index f3343fc..d426477 100644 +--- a/libgcc/crtstuff.c ++++ b/libgcc/crtstuff.c +@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) + #include + # define USE_PT_GNU_EH_FRAME +@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__sun__) && defined(__svr4__) + #include + # define USE_PT_GNU_EH_FRAME +@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__GLIBC__) && __GLIBC__ >= 2 + #include + /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. +@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(CRTSTUFFT_O) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(inhibit_libc) \ + && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) + /* On systems using glibc, an inhibit_libc build of libgcc is only +diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h +index 555c0fe..47c8655 100644 +--- a/libgcc/gthr-posix.h ++++ b/libgcc/gthr-posix.h +@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see + #define __GTHREADS 1 + #define __GTHREADS_CXX0X 1 + ++/* The following should normally be in a different header file, ++ * but I couldn't find the right location. The point of the macro ++ * definition below is to prevent libsupc++ and libstdc++ to reference ++ * weak symbols in their static C++ constructors. Such code crashes ++ * when a shared object linked statically to these libraries is ++ * loaded on Android 2.1 (Eclair) and older platform releases, due ++ * to a dynamic linker bug. ++ */ ++#ifdef __ANDROID__ ++#undef GTHREAD_USE_WEAK ++#define GTHREAD_USE_WEAK 0 ++#endif ++ + #include + + #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ +diff --git a/libgcc/gthr.h b/libgcc/gthr.h +index 47a7d06..67a680f 100644 +--- a/libgcc/gthr.h ++++ b/libgcc/gthr.h +@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define GTHREAD_USE_WEAK 1 + #endif + #endif ++#if __ANDROID__ ++#include "gthr-posix.h" ++#else + #include "gthr-default.h" ++#endif + + #ifndef HIDE_EXPORTS + #pragma GCC visibility pop +diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure +index 41797a9..5a631dd 100755 +--- a/libstdc++-v3/configure ++++ b/libstdc++-v3/configure +@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h +index e3e206b..e85dc2c 100644 +--- a/libstdc++-v3/include/bits/locale_facets.h ++++ b/libstdc++-v3/include/bits/locale_facets.h +@@ -47,6 +47,20 @@ + #include + #include + ++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ ++// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and ++// ctype::_M_widen_init() methods working wrong if optimization enabled. ++// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) ++// are completely messed up and don't correspond to passed values. In case if ++// we disable optimization for those methods, things become correct so we apply ++// this workaround here for a time. ++// TODO: figure out what exactly wrong here - is it bug in GCC optimization ++// algorithm or smth else? ++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) ++#else ++#define __CRYSTAX_X86_DONT_OPTIMIZE ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + * @return @a __hi. 
+ */ + virtual const char* +- do_widen(const char* __lo, const char* __hi, char_type* __to) const ++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE + { + __builtin_memcpy(__to, __lo, __hi - __lo); + return __hi; +@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + + private: + void _M_narrow_init() const; +- void _M_widen_init() const; ++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; + }; + + #ifdef _GLIBCXX_USE_WCHAR_T +diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc +index 9b61799..c149169 100644 +--- a/libstdc++-v3/libsupc++/guard.cc ++++ b/libstdc++-v3/libsupc++/guard.cc +@@ -33,7 +33,12 @@ + #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ + && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) + # include ++#if defined(__ANDROID__) ++# include ++# define SYS_futex __NR_futex ++#else + # include ++#endif + # include + # define _GLIBCXX_USE_FUTEX + # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/6.3.0/951-bionic-ndk.patch b/patches/gcc/6.3.0/951-bionic-ndk.patch new file mode 100644 index 0000000..59c50a8 --- /dev/null +++ b/patches/gcc/6.3.0/951-bionic-ndk.patch @@ -0,0 +1,58 @@ +commit d38d37bdfe24b7ce1bdcb55642fb6b904718e68f +Author: Howard Chu +Date: Tue Apr 25 19:02:18 2017 -0700 + + Fix ctype for newer NDK headers + +diff --git a/libstdc++-v3/config/os/bionic/ctype_base.h b/libstdc++-v3/config/os/bionic/ctype_base.h +index 33978f3..c36e63c 100644 +--- a/libstdc++-v3/config/os/bionic/ctype_base.h ++++ b/libstdc++-v3/config/os/bionic/ctype_base.h +@@ -28,6 +28,18 @@ + + // Information as gleaned from /usr/include/ctype.h + ++// _CTYPE prefix was added in NDK r14 unified headers ++#ifndef _CTYPE_U ++#define _CTYPE_U _U ++#define _CTYPE_L _L ++#define _CTYPE_D _N ++#define _CTYPE_S _S ++#define _CTYPE_P _P ++#define _CTYPE_C _C ++#define _CTYPE_X _X ++#define _CTYPE_B _B ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -41,17 +53,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + // NB: Offsets into ctype::_M_table force a particular size + // on the mask type. Because of this, we don't use an enum. 
+ typedef char mask; +- static const mask upper = _U; +- static const mask lower = _L; +- static const mask alpha = _U | _L; +- static const mask digit = _N; +- static const mask xdigit = _X | _N; +- static const mask space = _S; +- static const mask print = _P | _U | _L | _N | _B; +- static const mask graph = _P | _U | _L | _N; +- static const mask cntrl = _C; +- static const mask punct = _P; +- static const mask alnum = _U | _L | _N; ++ static const mask upper = _CTYPE_U; ++ static const mask lower = _CTYPE_L; ++ static const mask alpha = _CTYPE_U | _CTYPE_L; ++ static const mask digit = _CTYPE_D; ++ static const mask xdigit = _CTYPE_X | _CTYPE_D; ++ static const mask space = _CTYPE_S; ++ static const mask print = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D | _CTYPE_B; ++ static const mask graph = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D; ++ static const mask cntrl = _CTYPE_C; ++ static const mask punct = _CTYPE_P; ++ static const mask alnum = _CTYPE_U | _CTYPE_L | _CTYPE_D; + #if __cplusplus >= 201103L + static const mask blank = space; + #endif diff --git a/patches/gcc/6.3.0/952-bionic-errno.patch b/patches/gcc/6.3.0/952-bionic-errno.patch new file mode 100644 index 0000000..91f6ca3 --- /dev/null +++ b/patches/gcc/6.3.0/952-bionic-errno.patch @@ -0,0 +1,19 @@ +commit 6cd4ad106ef87a3c58b0c3478e78409b47000de0 +Author: Howard Chu +Date: Tue Apr 25 20:17:03 2017 -0700 + + Fix, errno is volatile int + +diff --git a/libstdc++-v3/src/filesystem/dir.cc b/libstdc++-v3/src/filesystem/dir.cc +index 6ff12d0..5bbd664 100644 +--- a/libstdc++-v3/src/filesystem/dir.cc ++++ b/libstdc++-v3/src/filesystem/dir.cc +@@ -147,7 +147,7 @@ fs::_Dir::advance(error_code* ec, directory_options options) + + int err = std::exchange(errno, 0); + const auto entp = readdir(dirp); +- std::swap(errno, err); ++ std::swap((int&)errno, err); + + if (entp) + { diff --git a/samples/aarch64-unknown-linux-android/crosstool.config b/samples/aarch64-unknown-linux-android/crosstool.config new file mode 100644 index 0000000..fa147e2 --- /dev/null +++ b/samples/aarch64-unknown-linux-android/crosstool.config @@ -0,0 +1,10 @@ +# CT_RM_RF_PREFIX_DIR is not set +CT_ARCH_arm=y +CT_ARCH_64=y +CT_ARCH_ARCH="armv8-a" +CT_STATIC_TOOLCHAIN=y +CT_KERNEL_linux=y +CT_LIBC_BIONIC_V_14b=y +CT_ANDROID_API_21=y +CT_CC_LANG_CXX=y +CT_GETTEXT=y diff --git a/samples/aarch64-unknown-linux-android/reported.by b/samples/aarch64-unknown-linux-android/reported.by new file mode 100644 index 0000000..fb4b9ec --- /dev/null +++ b/samples/aarch64-unknown-linux-android/reported.by @@ -0,0 +1,3 @@ +reporter_name="Howard Chu" +reporter_url="http://www.symas.com" +reporter_comment="Config to build cross-compiler for Android/bionic on ARM64" -- cgit v0.10.2-6-g49f6 From 22e4a3f4334ee74233e40e19f3a72187eed3bb91 Mon Sep 17 00:00:00 2001 From: hyc Date: Mon, 8 May 2017 18:10:39 +0100 Subject: Fix samples defaulted to glibc Since glibc is no longer the default C library diff --git a/samples/aarch64-rpi3-linux-gnueabi/crosstool.config b/samples/aarch64-rpi3-linux-gnueabi/crosstool.config index 01605ac..6d51c12 100644 --- a/samples/aarch64-rpi3-linux-gnueabi/crosstool.config +++ b/samples/aarch64-rpi3-linux-gnueabi/crosstool.config @@ -7,5 +7,6 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/aarch64-unknown-linux-gnueabi/crosstool.config b/samples/aarch64-unknown-linux-gnueabi/crosstool.config index e3a393a..8ff183c 100644 --- 
a/samples/aarch64-unknown-linux-gnueabi/crosstool.config +++ b/samples/aarch64-unknown-linux-gnueabi/crosstool.config @@ -6,5 +6,6 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/arm-cortex_a15-linux-gnueabihf/crosstool.config b/samples/arm-cortex_a15-linux-gnueabihf/crosstool.config index 5d0ab9c..76174b6 100644 --- a/samples/arm-cortex_a15-linux-gnueabihf/crosstool.config +++ b/samples/arm-cortex_a15-linux-gnueabihf/crosstool.config @@ -8,6 +8,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_GCC_SHOW_LINARO=y CT_CC_LANG_CXX=y CT_DEBUG_duma=y diff --git a/samples/arm-cortex_a8-linux-gnueabi/crosstool.config b/samples/arm-cortex_a8-linux-gnueabi/crosstool.config index f5245e2..7b7a74b 100644 --- a/samples/arm-cortex_a8-linux-gnueabi/crosstool.config +++ b/samples/arm-cortex_a8-linux-gnueabi/crosstool.config @@ -7,6 +7,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_duma=y CT_DEBUG_gdb=y diff --git a/samples/arm-cortexa9_neon-linux-gnueabihf/crosstool.config b/samples/arm-cortexa9_neon-linux-gnueabihf/crosstool.config index 9330cce..f5a57dd 100644 --- a/samples/arm-cortexa9_neon-linux-gnueabihf/crosstool.config +++ b/samples/arm-cortexa9_neon-linux-gnueabihf/crosstool.config @@ -10,6 +10,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_LIBC_ADDONS_LIST="libidn" CT_LIBC_LOCALES=y CT_LIBC_GLIBC_KERNEL_VERSION_NONE=y diff --git a/samples/arm-unknown-linux-gnueabi/crosstool.config b/samples/arm-unknown-linux-gnueabi/crosstool.config index 85909e5..20157cf 100644 --- a/samples/arm-unknown-linux-gnueabi/crosstool.config +++ b/samples/arm-unknown-linux-gnueabi/crosstool.config @@ -6,6 +6,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y # CT_CC_GCC_SJLJ_EXCEPTIONS is not set CT_CC_LANG_CXX=y CT_DEBUG_duma=y diff --git a/samples/armeb-unknown-linux-gnueabi/crosstool.config b/samples/armeb-unknown-linux-gnueabi/crosstool.config index eb72062..983c33d 100644 --- a/samples/armeb-unknown-linux-gnueabi/crosstool.config +++ b/samples/armeb-unknown-linux-gnueabi/crosstool.config @@ -7,6 +7,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y # CT_CC_GCC_SJLJ_EXCEPTIONS is not set CT_CC_LANG_CXX=y CT_DEBUG_duma=y diff --git a/samples/armv6-rpi-linux-gnueabi/crosstool.config b/samples/armv6-rpi-linux-gnueabi/crosstool.config index 0affdb7..22fbfaa 100644 --- a/samples/armv6-rpi-linux-gnueabi/crosstool.config +++ b/samples/armv6-rpi-linux-gnueabi/crosstool.config @@ -9,6 +9,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_LIBC_LOCALES=y CT_LIBC_GLIBC_KERNEL_VERSION_CHOSEN=y CT_LIBC_GLIBC_MIN_KERNEL_VERSION="3.2.27" diff --git a/samples/armv7-rpi2-linux-gnueabihf/crosstool.config b/samples/armv7-rpi2-linux-gnueabihf/crosstool.config index 08f903d..30b678d 100644 --- a/samples/armv7-rpi2-linux-gnueabihf/crosstool.config +++ b/samples/armv7-rpi2-linux-gnueabihf/crosstool.config @@ -9,5 +9,6 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y 
+CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/armv8-rpi3-linux-gnueabihf/crosstool.config b/samples/armv8-rpi3-linux-gnueabihf/crosstool.config index 117560e..6d1d0f8 100644 --- a/samples/armv8-rpi3-linux-gnueabihf/crosstool.config +++ b/samples/armv8-rpi3-linux-gnueabihf/crosstool.config @@ -9,5 +9,6 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/i686-centos6-linux-gnu/crosstool.config b/samples/i686-centos6-linux-gnu/crosstool.config index 204f9fb..86ec1f6 100644 --- a/samples/i686-centos6-linux-gnu/crosstool.config +++ b/samples/i686-centos6-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="i686" CT_TARGET_VENDOR="centos6" CT_KERNEL_linux=y CT_KERNEL_V_2_6_32=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_12_2=y CT_CC_LANG_CXX=y diff --git a/samples/i686-centos7-linux-gnu/crosstool.config b/samples/i686-centos7-linux-gnu/crosstool.config index bb9f007..b6403bb 100644 --- a/samples/i686-centos7-linux-gnu/crosstool.config +++ b/samples/i686-centos7-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="i686" CT_TARGET_VENDOR="centos7" CT_KERNEL_linux=y CT_KERNEL_V_3_10=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_17=y CT_CC_LANG_CXX=y diff --git a/samples/i686-nptl-linux-gnu/crosstool.config b/samples/i686-nptl-linux-gnu/crosstool.config index a59823c..1f14d67 100644 --- a/samples/i686-nptl-linux-gnu/crosstool.config +++ b/samples/i686-nptl-linux-gnu/crosstool.config @@ -6,6 +6,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_duma=y CT_DEBUG_gdb=y diff --git a/samples/i686-ubuntu12.04-linux-gnu/crosstool.config b/samples/i686-ubuntu12.04-linux-gnu/crosstool.config index 030c109..01c4312 100644 --- a/samples/i686-ubuntu12.04-linux-gnu/crosstool.config +++ b/samples/i686-ubuntu12.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="i686" CT_TARGET_VENDOR="ubuntu12.04" CT_KERNEL_linux=y CT_KERNEL_V_3_2=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_15=y CT_CC_LANG_CXX=y diff --git a/samples/i686-ubuntu14.04-linux-gnu/crosstool.config b/samples/i686-ubuntu14.04-linux-gnu/crosstool.config index a67103a..e3f459b 100644 --- a/samples/i686-ubuntu14.04-linux-gnu/crosstool.config +++ b/samples/i686-ubuntu14.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="i686" CT_TARGET_VENDOR="ubuntu14.04" CT_KERNEL_linux=y CT_KERNEL_V_3_13=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_19=y CT_CC_LANG_CXX=y diff --git a/samples/i686-ubuntu16.04-linux-gnu/crosstool.config b/samples/i686-ubuntu16.04-linux-gnu/crosstool.config index 3ceac41..9eec3c1 100644 --- a/samples/i686-ubuntu16.04-linux-gnu/crosstool.config +++ b/samples/i686-ubuntu16.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="i686" CT_TARGET_VENDOR="ubuntu16.04" CT_KERNEL_linux=y CT_KERNEL_V_4_4=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_23=y CT_CC_LANG_CXX=y diff --git a/samples/mips-ar2315-linux-gnu/crosstool.config b/samples/mips-ar2315-linux-gnu/crosstool.config index 579c801..55ae935 100644 --- a/samples/mips-ar2315-linux-gnu/crosstool.config +++ b/samples/mips-ar2315-linux-gnu/crosstool.config @@ -5,6 +5,7 @@ CT_ARCH_FLOAT_SW=y CT_TARGET_VENDOR="ar2315" CT_KERNEL_linux=y 
CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_GCC_mips_plt=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/mips-malta-linux-gnu/crosstool.config b/samples/mips-malta-linux-gnu/crosstool.config index 308a4d8..624d01d 100644 --- a/samples/mips-malta-linux-gnu/crosstool.config +++ b/samples/mips-malta-linux-gnu/crosstool.config @@ -2,6 +2,7 @@ CT_ARCH_mips=y CT_TARGET_VENDOR="malta" CT_KERNEL_linux=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_GCC_mips_plt=y CT_CC_LANG_CXX=y CT_DEBUG_gdb=y diff --git a/samples/mipsel-multilib-linux-gnu/crosstool.config b/samples/mipsel-multilib-linux-gnu/crosstool.config index 656541c..6f767a3 100644 --- a/samples/mipsel-multilib-linux-gnu/crosstool.config +++ b/samples/mipsel-multilib-linux-gnu/crosstool.config @@ -5,6 +5,7 @@ CT_ARCH_FLOAT_SW=y CT_TARGET_VENDOR="multilib" CT_KERNEL_linux=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_GCC_CORE_EXTRA_CONFIG_ARRAY="--enable-targets=all" CT_CC_GCC_EXTRA_CONFIG_ARRAY="--enable-targets=all" CT_CC_GCC_mips_plt=y diff --git a/samples/mipsel-unknown-linux-gnu/crosstool.config b/samples/mipsel-unknown-linux-gnu/crosstool.config index 39ed4d6..11320bf 100644 --- a/samples/mipsel-unknown-linux-gnu/crosstool.config +++ b/samples/mipsel-unknown-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_ARCH="mips1" CT_ARCH_FLOAT_SW=y CT_KERNEL_linux=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_CC_GCC_mips_plt=y CT_CC_LANG_CXX=y CT_DEBUG_duma=y diff --git a/samples/x86_64-centos6-linux-gnu/crosstool.config b/samples/x86_64-centos6-linux-gnu/crosstool.config index 4eb4611..ad84a6c 100644 --- a/samples/x86_64-centos6-linux-gnu/crosstool.config +++ b/samples/x86_64-centos6-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_64=y CT_TARGET_VENDOR="centos6" CT_KERNEL_linux=y CT_KERNEL_V_2_6_32=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_12_2=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-centos7-linux-gnu/crosstool.config b/samples/x86_64-centos7-linux-gnu/crosstool.config index 4a441ef..8ca023c 100644 --- a/samples/x86_64-centos7-linux-gnu/crosstool.config +++ b/samples/x86_64-centos7-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_64=y CT_TARGET_VENDOR="centos7" CT_KERNEL_linux=y CT_KERNEL_V_3_10=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_17=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-multilib-linux-gnu/crosstool.config b/samples/x86_64-multilib-linux-gnu/crosstool.config index c2e4f69..d3e5b63 100644 --- a/samples/x86_64-multilib-linux-gnu/crosstool.config +++ b/samples/x86_64-multilib-linux-gnu/crosstool.config @@ -7,6 +7,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_LIBC_GLIBC_KERNEL_VERSION_NONE=y CT_CC_GCC_MULTILIB_LIST="m64,m32,mx32" CT_CC_GCC_LNK_HASH_STYLE_BOTH=y diff --git a/samples/x86_64-ubuntu12.04-linux-gnu/crosstool.config b/samples/x86_64-ubuntu12.04-linux-gnu/crosstool.config index c09ab9d..3e2bc3f 100644 --- a/samples/x86_64-ubuntu12.04-linux-gnu/crosstool.config +++ b/samples/x86_64-ubuntu12.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_64=y CT_TARGET_VENDOR="ubuntu12.04" CT_KERNEL_linux=y CT_KERNEL_V_3_2=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_15=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-ubuntu14.04-linux-gnu/crosstool.config b/samples/x86_64-ubuntu14.04-linux-gnu/crosstool.config index 68b3102..4aa16c7 100644 --- a/samples/x86_64-ubuntu14.04-linux-gnu/crosstool.config +++ 
b/samples/x86_64-ubuntu14.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_64=y CT_TARGET_VENDOR="ubuntu14.04" CT_KERNEL_linux=y CT_KERNEL_V_3_13=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_19=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-ubuntu16.04-linux-gnu/crosstool.config b/samples/x86_64-ubuntu16.04-linux-gnu/crosstool.config index fc10edb..37be385 100644 --- a/samples/x86_64-ubuntu16.04-linux-gnu/crosstool.config +++ b/samples/x86_64-ubuntu16.04-linux-gnu/crosstool.config @@ -4,6 +4,7 @@ CT_ARCH_64=y CT_TARGET_VENDOR="ubuntu16.04" CT_KERNEL_linux=y CT_KERNEL_V_4_4=y +CT_LIBC_glibc=y # CT_KERNEL_LINUX_INSTALL_CHECK is not set CT_LIBC_GLIBC_V_2_23=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-unknown-linux-gnu/crosstool.config b/samples/x86_64-unknown-linux-gnu/crosstool.config index e88ae77..3f7c338 100644 --- a/samples/x86_64-unknown-linux-gnu/crosstool.config +++ b/samples/x86_64-unknown-linux-gnu/crosstool.config @@ -5,6 +5,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_PLUGINS=y +CT_LIBC_glibc=y CT_LIBC_GLIBC_KERNEL_VERSION_NONE=y CT_CC_GCC_LNK_HASH_STYLE_BOTH=y CT_CC_LANG_CXX=y diff --git a/samples/x86_64-w64-mingw32,arm-cortexa9_neon-linux-gnueabihf/crosstool.config b/samples/x86_64-w64-mingw32,arm-cortexa9_neon-linux-gnueabihf/crosstool.config index 5690be3..34d1ea1 100644 --- a/samples/x86_64-w64-mingw32,arm-cortexa9_neon-linux-gnueabihf/crosstool.config +++ b/samples/x86_64-w64-mingw32,arm-cortexa9_neon-linux-gnueabihf/crosstool.config @@ -11,6 +11,7 @@ CT_KERNEL_linux=y CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y +CT_LIBC_glibc=y CT_LIBC_ADDONS_LIST="libidn" CT_LIBC_LOCALES=y CT_LIBC_GLIBC_KERNEL_VERSION_NONE=y diff --git a/samples/x86_64-w64-mingw32,x86_64-pc-linux-gnu/crosstool.config b/samples/x86_64-w64-mingw32,x86_64-pc-linux-gnu/crosstool.config index 631e8f0..efd43a0 100644 --- a/samples/x86_64-w64-mingw32,x86_64-pc-linux-gnu/crosstool.config +++ b/samples/x86_64-w64-mingw32,x86_64-pc-linux-gnu/crosstool.config @@ -10,6 +10,7 @@ CT_BINUTILS_LINKER_LD_GOLD=y CT_BINUTILS_GOLD_THREADS=y CT_BINUTILS_LD_WRAPPER=y CT_BINUTILS_FOR_TARGET=y +CT_LIBC_glibc=y CT_CC_LANG_CXX=y CT_DEBUG_ltrace=y CT_DEBUG_strace=y -- cgit v0.10.2-6-g49f6 From 131011493b7550b5ed4bdf241e8432aeae7694e8 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 12:24:37 -0700 Subject: Split the patch from CrystaX to separate commits ... and edit the biggest chunk as in previous 950-bionic-android.patch. Also, drop arm_neon.h - it is a separate package, will be added separately. 
Signed-off-by: Alexey Neyman diff --git a/patches/gcc/6.3.0/950-bionic-android.patch b/patches/gcc/6.3.0/950-bionic-android.patch deleted file mode 100644 index 1ae1186..0000000 --- a/patches/gcc/6.3.0/950-bionic-android.patch +++ /dev/null @@ -1,17570 +0,0 @@ -Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master - -Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with -Crystax-specific lib support omitted -diff --git a/gcc/config.gcc b/gcc/config.gcc -index f66e48c..4380657 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -374,6 +374,7 @@ i[34567]86-*-*) - rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h - adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h - avx512cdintrin.h avx512erintrin.h avx512pfintrin.h -+ arm_neon.h - shaintrin.h clflushoptintrin.h xsavecintrin.h - xsavesintrin.h avx512dqintrin.h avx512bwintrin.h - avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h -@@ -396,6 +397,7 @@ x86_64-*-*) - rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h - adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h - avx512cdintrin.h avx512erintrin.h avx512pfintrin.h -+ arm_neon.h - shaintrin.h clflushoptintrin.h xsavecintrin.h - xsavesintrin.h avx512dqintrin.h avx512bwintrin.h - avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h -@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) - TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` - ;; - aarch64*-*-linux*) -- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" -+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" - tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" -+ extra_options="${extra_options} linux-android.opt" - tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" - case $target in - aarch64_be-*) - tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" - ;; -+ aarch64*-*-linux-android*) -+ tm_file="${tm_file} aarch64/aarch64-linux-android.h" -+ ;; - esac - aarch64_multilibs="${with_multilib_list}" - if test "$aarch64_multilibs" = "default"; then -@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian. - tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" - extra_options="${extra_options} linux-android.opt" - case ${target} in -+ mips64*android*) -+ default_mips_arch=mips64r6 -+ default_mips_abi=64 -+ tm_file="${tm_file} mips/android.h" -+ tmake_file="${tmake_file} mips/t-linux-android64" -+ ;; -+ mips*android*) -+ default_mips_arch=mips32 -+ tm_file="${tm_file} mips/android.h" -+ tmake_file="$tmake_file mips/t-linux-android" -+ ;; - mipsisa32r6*) - default_mips_arch=mips32r6 - ;; -diff --git a/gcc/config.in b/gcc/config.in -index 115cb61..9339168 100644 ---- a/gcc/config.in -+++ b/gcc/config.in -@@ -2119,6 +2119,12 @@ - #endif - - -+/* Define if your system supports PT_GNU_EH_FRAME for static executable. */ -+#ifndef USED_FOR_TARGET -+#undef USE_EH_FRAME_HDR_FOR_STATIC -+#endif -+ -+ - /* Define to 1 if the 'long long' type is wider than 'long' but still - efficiently supported by the host hardware. */ - #ifndef USED_FOR_TARGET -diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h -new file mode 100644 -index 0000000..cffd84d ---- /dev/null -+++ b/gcc/config/aarch64/aarch64-linux-android.h -@@ -0,0 +1,63 @@ -+/* Machine description for AArch64 architecture. -+ Copyright (C) 2014 Free Software Foundation, Inc. 
-+ -+ This file is part of GCC. -+ -+ GCC is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 3, or (at your option) -+ any later version. -+ -+ GCC is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with GCC; see the file COPYING3. If not see -+ . */ -+ -+#ifndef GCC_AARCH64_LINUX_ANDROID_H -+#define GCC_AARCH64_LINUX_ANDROID_H -+ -+ -+#undef TARGET_OS_CPP_BUILTINS -+#define TARGET_OS_CPP_BUILTINS() \ -+ do \ -+ { \ -+ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ -+ ANDROID_TARGET_OS_CPP_BUILTINS(); \ -+ } \ -+ while (0) -+ -+#undef LINK_SPEC -+#define LINK_SPEC \ -+ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ -+ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) -+ -+#undef CC1_SPEC -+#define CC1_SPEC \ -+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) -+ -+#define CC1PLUS_SPEC \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) -+ -+#undef LIB_SPEC -+#define LIB_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ -+ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) -+ -+#undef STARTFILE_SPEC -+#define STARTFILE_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) -+ -+#undef ENDFILE_SPEC -+#define ENDFILE_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) -+ -+#ifdef IN_LIBGCC2 -+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) -+#endif -+ -+#endif /* GCC_AARCH64_LINUX_ANDROID_H */ -diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h -index 5fcaa59..6864195 100644 ---- a/gcc/config/aarch64/aarch64-linux.h -+++ b/gcc/config/aarch64/aarch64-linux.h -@@ -21,7 +21,14 @@ - #ifndef GCC_AARCH64_LINUX_H - #define GCC_AARCH64_LINUX_H - --#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -+#ifndef RUNTIME_ROOT_PREFIX -+#define RUNTIME_ROOT_PREFIX "" -+#endif -+#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -+#ifdef BIONIC_DYNAMIC_LINKER -+#undef BIONIC_DYNAMIC_LINKER -+#endif -+#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" - - #undef MUSL_DYNAMIC_LINKER - #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h -index 093c38b..54b3e0c 100644 ---- a/gcc/config/alpha/elf.h -+++ b/gcc/config/alpha/elf.h -@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; - I imagine that other systems will catch up. In the meantime, it - doesn't harm to make sure that the data exists to be used later. 
*/ - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif -diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c -index 5974c65..71b2c7a 100644 ---- a/gcc/config/arm/arm.c -+++ b/gcc/config/arm/arm.c -@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) - memsize = MEM_SIZE (x); - - /* Only certain alignment specifiers are supported by the hardware. */ -- if (memsize == 32 && (align % 32) == 0) -+ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC -+ honors stricter alignment of composite type in user code, it doesn't -+ observe the alignment of memory passed as an extra argument for function -+ returning large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ -+ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) - align_bits = 256; -- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) -+ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) - align_bits = 128; - else if (memsize >= 8 && (align % 8) == 0) - align_bits = 64; -diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h -index ad123dd..97b059d 100644 ---- a/gcc/config/arm/arm.h -+++ b/gcc/config/arm/arm.h -@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes - - #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ - || (TARGET_THUMB1 \ -+ && !inline_thumb1_jump_table \ - && (optimize_size || flag_pic))) - - #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ -- (TARGET_THUMB1 \ -+ (TARGET_THUMB1 && !inline_thumb1_jump_table \ - ? (min >= 0 && max < 512 \ - ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ - : min >= -256 && max < 256 \ -diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md -index 47171b9..eb22d11 100644 ---- a/gcc/config/arm/arm.md -+++ b/gcc/config/arm/arm.md -@@ -8179,7 +8179,7 @@ - (match_operand:SI 2 "const_int_operand" "") ; total range - (match_operand:SI 3 "" "") ; table label - (match_operand:SI 4 "" "")] ; Out of range label -- "TARGET_32BIT || optimize_size || flag_pic" -+ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" - " - { - enum insn_code code; -diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt -index 0ebe017..7724538 100644 ---- a/gcc/config/arm/arm.opt -+++ b/gcc/config/arm/arm.opt -@@ -193,6 +193,10 @@ mthumb-interwork - Target Report Mask(INTERWORK) - Support calls between Thumb and ARM instruction sets. - -+minline-thumb1-jumptable -+Target Report Var(inline_thumb1_jump_table) -+Inline Thumb1 Jump table code -+ - mtls-dialect= - Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) - Specify thread local storage scheme. 
-diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h -index 77f3055..32158ed 100644 ---- a/gcc/config/arm/elf.h -+++ b/gcc/config/arm/elf.h -@@ -56,8 +56,7 @@ - #undef SUBSUBTARGET_EXTRA_SPECS - #define SUBSUBTARGET_EXTRA_SPECS - --#ifndef ASM_SPEC --#define ASM_SPEC "\ -+#define DEFAULT_ASM_SPEC "\ - %{mbig-endian:-EB} \ - %{mlittle-endian:-EL} \ - %(asm_cpu_spec) \ -@@ -66,6 +65,9 @@ - %{mthumb-interwork:-mthumb-interwork} \ - %{mfloat-abi=*} %{mfpu=*} \ - %(subtarget_extra_asm_spec)" -+ -+#ifndef ASM_SPEC -+#define ASM_SPEC DEFAULT_ASM_SPEC - #endif - - /* The ARM uses @ are a comment character so we need to redefine -@@ -104,7 +106,8 @@ - the code more efficient, but for Thumb-1 it's better to put them out of - band unless we are generating compressed tables. */ - #define JUMP_TABLES_IN_TEXT_SECTION \ -- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) -+ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ -+ && (optimize_size || flag_pic))) - - #ifndef LINK_SPEC - #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" -diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h -index ace8481..ed7f4d0 100644 ---- a/gcc/config/arm/linux-eabi.h -+++ b/gcc/config/arm/linux-eabi.h -@@ -108,11 +108,16 @@ - #define CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ - GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ -- ANDROID_CC1_SPEC) -+ ANDROID_CC1_SPEC("-fpic")) - - #define CC1PLUS_SPEC \ - LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) - -+#undef ASM_SPEC -+#define ASM_SPEC \ -+ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ -+ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) -+ - #undef LIB_SPEC - #define LIB_SPEC \ - LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ -diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi -index 8f1307c..cbbec5b 100644 ---- a/gcc/config/arm/t-linux-androideabi -+++ b/gcc/config/arm/t-linux-androideabi -@@ -1,8 +1,9 @@ --MULTILIB_OPTIONS = march=armv7-a mthumb --MULTILIB_DIRNAMES = armv7-a thumb --MULTILIB_EXCEPTIONS = -+MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard -+MULTILIB_DIRNAMES = armv7-a thumb hard -+MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* - MULTILIB_MATCHES = - MULTILIB_OSDIRNAMES = -+MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch - - # The "special" multilib can be used to build native applications for Android, - # as opposed to native shared libraries that are then called via JNI. -diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h -index 5ded869..5f51ac8 100644 ---- a/gcc/config/freebsd.h -+++ b/gcc/config/freebsd.h -@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see - #define LIB_SPEC FBSD_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #ifdef TARGET_LIBC_PROVIDES_SSP - #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ -diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h -index b0bf40a..d1874bc 100644 ---- a/gcc/config/gnu-user.h -+++ b/gcc/config/gnu-user.h -@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see - #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #undef LINK_GCC_C_SEQUENCE_SPEC - #define LINK_GCC_C_SEQUENCE_SPEC \ -diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h -new file mode 100644 -index 0000000..10944f3 ---- /dev/null -+++ b/gcc/config/i386/arm_neon.h -@@ -0,0 +1,16622 @@ -+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com -+ -+//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved. -+ -+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -+ -+//By downloading, copying, installing or using the software you agree to this license. -+//If you do not agree to this license, do not download, install, copy or use the software. -+ -+// License Agreement -+ -+//Permission to use, copy, modify, and/or distribute this software for any -+//purpose with or without fee is hereby granted, provided that the above -+//copyright notice and this permission notice appear in all copies. -+ -+//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH -+//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY -+//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, -+//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -+//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -+//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -+//PERFORMANCE OF THIS SOFTWARE. -+ -+//***************************************************************************************** -+// This file is intended to simplify ARM->IA32 porting -+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h") -+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below -+// MMX instruction set is not used due to performance overhead and the necessity to use the -+// EMMS instruction (_mm_empty())for mmx-x87 floating point switching -+//***************************************************************************************** -+ -+//!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual. -+//!!!!!!! Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for -+//!!!!!!! greater performance. It can be done by -msse4.2 compiler switch. 
-+ -+#ifndef NEON2SSE_H -+#define NEON2SSE_H -+ -+#ifndef USE_SSE4 -+#if defined(__SSE4_2__) -+ #define USE_SSE4 -+#endif -+#endif -+ -+#include //SSE -+#include //SSE2 -+#include //SSE3 -+#include //SSSE3 -+#ifdef USE_SSE4 -+#include //SSE4.1 -+#include //SSE4.2 -+#endif -+ -+ -+//*************** functions and data attributes, compiler dependent ********************************* -+//*********************************************************************************** -+#ifdef __GNUC__ -+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -+#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16))) -+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -+#if _GCC_VERSION < 40500 -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function -+#else -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function -+#endif -+#if defined(__x86_64__) -+ #define _NEON2SSE_64BIT __x86_64__ -+#endif -+#else -+#define _NEON2SSE_ALIGN_16 __declspec(align(16)) -+#define _NEON2SSE_INLINE __inline -+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER) -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function -+#if defined(_M_X64) -+ #define _NEON2SSE_64BIT _M_X64 -+#endif -+#else -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function -+#endif -+#endif -+ -+#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4) -+ #define _NEON2SSE_64BIT_SSE4 -+#endif -+ -+/*********************************************************************************************************************/ -+// data types conversion -+/*********************************************************************************************************************/ -+#if defined(_MSC_VER) && (_MSC_VER < 1300) -+ typedef signed char int8_t; -+ typedef unsigned char uint8_t; -+ typedef signed short int16_t; -+ typedef unsigned short uint16_t; -+ typedef signed int int32_t; -+ typedef unsigned int uint32_t; -+ typedef signed long long int64_t; -+ typedef unsigned long long uint64_t; -+#elif defined(_MSC_VER) -+ typedef signed __int8 int8_t; -+ typedef unsigned __int8 uint8_t; -+ typedef signed __int16 int16_t; -+ typedef unsigned __int16 uint16_t; -+ typedef signed __int32 int32_t; -+ typedef unsigned __int32 uint32_t; -+ -+ typedef signed long long int64_t; -+ typedef unsigned long long uint64_t; -+#else -+#include -+#include -+#endif -+ -+typedef union __m64_128 { -+ uint64_t m64_u64[1]; -+ float m64_f32[2]; -+ int8_t m64_i8[8]; -+ int16_t m64_i16[4]; -+ int32_t m64_i32[2]; -+ int64_t m64_i64[1]; -+ uint8_t m64_u8[8]; -+ uint16_t m64_u16[4]; -+ uint32_t m64_u32[2]; -+} __m64_128; -+ -+typedef __m64_128 int8x8_t; -+typedef __m64_128 uint8x8_t; -+typedef __m64_128 int16x4_t; -+typedef __m64_128 uint16x4_t; -+typedef __m64_128 int32x2_t; -+typedef __m64_128 uint32x2_t; -+typedef __m64_128 int64x1_t; -+typedef __m64_128 uint64x1_t; -+typedef __m64_128 poly8x8_t; -+typedef __m64_128 poly16x4_t; -+ -+typedef __m64_128 float32x2_t; -+typedef __m128 float32x4_t; -+ -+typedef __m128 float16x4_t; //not supported by IA, for compatibility -+typedef __m128 float16x8_t; //not supported by IA, for compatibility -+ -+typedef __m128i int8x16_t; -+typedef __m128i int16x8_t; -+typedef __m128i int32x4_t; -+typedef __m128i int64x2_t; -+typedef __m128i uint8x16_t; -+typedef __m128i uint16x8_t; -+typedef __m128i uint32x4_t; -+typedef __m128i 
uint64x2_t; -+typedef __m128i poly8x16_t; -+typedef __m128i poly16x8_t; -+ -+#if defined(_MSC_VER) -+ #define SINT_MIN (-2147483647 - 1) /* min signed int value */ -+ #define SINT_MAX 2147483647 /* max signed int value */ -+#else -+ #define SINT_MIN INT_MIN /* min signed int value */ -+ #define SINT_MAX INT_MAX /* max signed int value */ -+#endif -+ -+typedef float float32_t; -+#if !defined(__clang__) -+typedef float __fp16; -+#endif -+ -+typedef uint8_t poly8_t; -+typedef uint16_t poly16_t; -+ -+ -+//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in -+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types -+struct int8x16x2_t { -+ int8x16_t val[2]; -+}; -+struct int16x8x2_t { -+ int16x8_t val[2]; -+}; -+struct int32x4x2_t { -+ int32x4_t val[2]; -+}; -+struct int64x2x2_t { -+ int64x2_t val[2]; -+}; -+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!! -+struct int8x8x2_t { -+ int8x8_t val[2]; -+}; -+struct int16x4x2_t { -+ int16x4_t val[2]; -+}; -+struct int32x2x2_t { -+ int32x2_t val[2]; -+}; -+struct int64x1x2_t { -+ int64x1_t val[2]; -+}; -+ -+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy -+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy -+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy -+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy -+ -+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy -+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy -+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy -+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */ -+typedef struct int8x16x2_t uint8x16x2_t; -+typedef struct int16x8x2_t uint16x8x2_t; -+typedef struct int32x4x2_t uint32x4x2_t; -+typedef struct int64x2x2_t uint64x2x2_t; -+typedef struct int8x16x2_t poly8x16x2_t; -+typedef struct int16x8x2_t poly16x8x2_t; -+ -+typedef struct int8x8x2_t uint8x8x2_t; -+typedef struct int16x4x2_t uint16x4x2_t; -+typedef struct int32x2x2_t uint32x2x2_t; -+typedef struct int64x1x2_t uint64x1x2_t; -+typedef struct int8x8x2_t poly8x8x2_t; -+typedef struct int16x4x2_t poly16x4x2_t; -+ -+//float -+struct float32x4x2_t { -+ float32x4_t val[2]; -+}; -+struct float16x8x2_t { -+ float16x8_t val[2]; -+}; -+struct float32x2x2_t { -+ float32x2_t val[2]; -+}; -+ -+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy -+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy -+typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy -+typedef float16x8x2_t float16x4x2_t; -+ -+//4 -+struct int8x16x4_t { -+ int8x16_t val[4]; -+}; -+struct int16x8x4_t { -+ int16x8_t val[4]; -+}; -+struct int32x4x4_t { -+ int32x4_t val[4]; -+}; -+struct int64x2x4_t { -+ int64x2_t val[4]; -+}; -+ -+struct int8x8x4_t { -+ int8x8_t val[4]; -+}; -+struct int16x4x4_t { -+ int16x4_t val[4]; -+}; -+struct int32x2x4_t { -+ int32x2_t val[4]; -+}; -+struct int64x1x4_t { -+ int64x1_t val[4]; -+}; -+ -+typedef struct int8x16x4_t int8x16x4_t; 
//for C compilers to make them happy -+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy -+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy -+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy -+ -+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy -+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy -+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy -+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ -+typedef struct int8x8x4_t uint8x8x4_t; -+typedef struct int16x4x4_t uint16x4x4_t; -+typedef struct int32x2x4_t uint32x2x4_t; -+typedef struct int64x1x4_t uint64x1x4_t; -+typedef struct int8x8x4_t poly8x8x4_t; -+typedef struct int16x4x4_t poly16x4x4_t; -+ -+typedef struct int8x16x4_t uint8x16x4_t; -+typedef struct int16x8x4_t uint16x8x4_t; -+typedef struct int32x4x4_t uint32x4x4_t; -+typedef struct int64x2x4_t uint64x2x4_t; -+typedef struct int8x16x4_t poly8x16x4_t; -+typedef struct int16x8x4_t poly16x8x4_t; -+ -+struct float32x4x4_t { -+ float32x4_t val[4]; -+}; -+struct float16x8x4_t { -+ float16x8_t val[4]; -+}; -+struct float32x2x4_t { -+ float32x2_t val[4]; -+}; -+ -+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy -+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy -+typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy -+typedef float16x8x4_t float16x4x4_t; -+ -+//3 -+struct int16x8x3_t { -+ int16x8_t val[3]; -+}; -+struct int32x4x3_t { -+ int32x4_t val[3]; -+}; -+struct int64x2x3_t { -+ int64x2_t val[3]; -+}; -+struct int8x16x3_t { -+ int8x16_t val[3]; -+}; -+ -+struct int16x4x3_t { -+ int16x4_t val[3]; -+}; -+struct int32x2x3_t { -+ int32x2_t val[3]; -+}; -+struct int64x1x3_t { -+ int64x1_t val[3]; -+}; -+struct int8x8x3_t { -+ int8x8_t val[3]; -+}; -+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy -+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy -+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy -+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy -+ -+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy -+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy -+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy -+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy -+ -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ -+typedef struct int8x16x3_t uint8x16x3_t; -+typedef struct int16x8x3_t uint16x8x3_t; -+typedef struct int32x4x3_t uint32x4x3_t; -+typedef struct int64x2x3_t uint64x2x3_t; -+typedef struct int8x16x3_t poly8x16x3_t; -+typedef struct int16x8x3_t poly16x8x3_t; -+typedef struct int8x8x3_t uint8x8x3_t; -+typedef struct int16x4x3_t uint16x4x3_t; -+typedef struct int32x2x3_t uint32x2x3_t; -+typedef struct int64x1x3_t uint64x1x3_t; -+typedef struct int8x8x3_t poly8x8x3_t; -+typedef struct int16x4x3_t poly16x4x3_t; -+ -+//float -+struct float32x4x3_t { -+ float32x4_t val[3]; -+}; -+struct float32x2x3_t { -+ float32x2_t val[3]; -+}; -+struct 
float16x8x3_t { -+ float16x8_t val[3]; -+}; -+ -+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy -+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy -+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy -+typedef float16x8x3_t float16x4x3_t; -+ -+ -+//**************************************************************************** -+//****** Porting auxiliary macros ******************************************** -+ -+//** floating point related macros ** -+#define _M128i(a) _mm_castps_si128(a) -+#define _M128(a) _mm_castsi128_ps(a) -+//here the most performance effective implementation is compiler and 32/64 bits build dependent -+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) ) -+ -+ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a))) -+ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp); -+ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp)); -+#else -+ //for 32bit gcc and Microsoft compilers builds -+ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a)) -+ #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp) -+ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp)) -+#endif -+#define _pM128(a) _mm_castsi128_ps(_pM128i(a)) -+ -+#define return64(a) _M64(res64,a); return res64; -+#define return64f(a) _M64f(res64,a); return res64; -+ -+#define _Ui64(a) (*(uint64_t*)&(a)) -+#define _UNSIGNED_T(a) u ## a -+ -+#define _SIGNBIT64 ((uint64_t)1 << 63) -+#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6)) -+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) ) -+ -+#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it" -+#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it" -+ -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+#define __constrange(min,max) const -+#define __transfersize(size) -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ -+//************************************************************************* -+//************************************************************************* -+//********* Functions declarations as declared in original arm_neon.h ***** -+//************************************************************************* -+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes. 
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 -+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 -+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 -+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 -+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 -+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 -+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 -+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 -+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 -+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 -+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 -+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 -+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 -+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 -+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 -+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 -+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 -+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 -+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. -+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 -+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 -+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 -+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 -+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0 -+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 -+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 -+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 -+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 -+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 -+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0 -+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 -+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 -+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 -+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 -+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 -+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0 -+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0 -+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 -+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 -+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0 -+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 -+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 -+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0 -+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 -+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1 -+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 -+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 -+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 -+uint8x8_t vrhadd_u8(uint8x8_t 
a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 -+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0 -+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 -+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 -+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 -+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 -+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 -+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0 -+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 -+//Vector saturating add: vqadd -> Vr[i]:=sat(Va[i]+Vb[i]) -+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 -+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 -+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 -+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 -+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 -+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0 -+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 -+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 -+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 -+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 -+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 -+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 -+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 -+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0 -+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 -+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 -+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i] -+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 -+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 -+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 -+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 -+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 -+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 -+//Vector rounding add high half: vraddhn -+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 -+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 -+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 -+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 -+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 -+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 -+//Multiplication -+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] -+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 -+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 -+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 -+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 -+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 -+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 -+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 -+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 -+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 -+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 -+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 -+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 -+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 -+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 -+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 -+//multiply lane -+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); -+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); -+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); -+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); -+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); -+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c); -+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); -+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); -+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); -+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); -+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] -+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 -+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 -+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 -+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 -+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 -+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 -+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 -+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 -+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 -+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 -+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 -+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 -+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 -+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 -+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] -+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 -+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 -+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 -+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 -+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0 -+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 -+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] -+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 -+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 -+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 -+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); 
// VMLS.F32 d0,d0,d0 -+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 -+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 -+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 -+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 -+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 -+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 -+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 -+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 -+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 -+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 -+//Vector multiply subtract long -+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 -+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 -+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 -+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 -+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0 -+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 -+//Vector saturating doubling multiply high -+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 -+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 -+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 -+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 -+//Vector saturating rounding doubling multiply high -+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 -+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 -+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 -+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 -+//Vector saturating doubling multiply accumulate long -+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 -+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 -+//Vector saturating doubling multiply subtract long -+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 -+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 -+//Vector long multiply -+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 -+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 -+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 -+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 -+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0 -+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 -+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 -+//Vector saturating doubling long multiply -+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 -+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 -+//Subtraction -+//Vector subtract -+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 -+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 -+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 -+int64x1_t 
vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 -+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 -+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 -+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 -+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 -+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 -+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 -+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 -+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 -+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 -+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 -+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 -+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 -+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 -+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 -+//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 -+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 -+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 -+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 -+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0 -+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 -+//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0 -+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 -+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 -+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 -+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0 -+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 -+//Vector saturating subtract -+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 -+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 -+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 -+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 -+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 -+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0 -+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 -+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 -+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 -+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 -+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 -+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 -+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 -+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0 -+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 -+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0 -+//Vector halving subtract -+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 -+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 -+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 -+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 -+uint16x4_t vhsub_u16(uint16x4_t a, 
uint16x4_t b); // VHSUB.U16 d0,d0,d0 -+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 -+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 -+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 -+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 -+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 -+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0 -+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 -+//Vector subtract high half -+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 -+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 -+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 -+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 -+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 -+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 -+//Vector rounding subtract high half -+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+//Comparison -+//Vector compare equal -+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 -+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 -+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 -+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 -+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 -+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 -+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 -+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 -+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 -+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 -+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 -+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 -+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 -+//Vector compare greater-than or equal -+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 -+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, 
q0, q0 -+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 -+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+//Vector compare less-than or equal -+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 -+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 -+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+//Vector compare greater-than -+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 -+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 -+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+//Vector compare less-than -+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 -+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 -+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+//Vector compare absolute greater-than or equal -+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+uint32x4_t 
vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+//Vector compare absolute less-than or equal -+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+//Vector compare absolute greater-than -+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+//Vector compare absolute less-than -+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+//Vector test bits -+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 -+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 -+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 -+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 -+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 -+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 -+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 -+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 -+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 -+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 -+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 -+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 -+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 -+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 -+//Absolute difference -+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] | -+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 -+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 -+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 -+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 -+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0 -+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 -+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 -+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 -+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 -+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 -+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 -+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0 -+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 -+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 -+//Absolute difference - long -+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 -+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 -+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 -+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 -+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0 -+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 -+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | -+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 -+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 -+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 -+uint8x8_t 
vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 -+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0 -+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 -+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 -+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 -+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 -+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 -+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0 -+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 -+//Absolute difference and accumulate - long -+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 -+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 -+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 -+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 -+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0 -+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 -+//Max/Min -+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] -+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 -+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 -+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 -+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 -+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0 -+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 -+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 -+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 -+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 -+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 -+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 -+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 -+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 -+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 -+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] -+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 -+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 -+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 -+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 -+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0 -+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 -+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 -+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 -+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 -+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 -+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 -+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 -+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 -+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 -+//Pairwise addition -+//Pairwise add -+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 -+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 -+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 -+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 -+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 -+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 -+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0 -+//Long pairwise add -+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 -+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 -+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 -+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 -+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0 -+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 -+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 -+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 -+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 -+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 -+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0 -+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 -+//Long pairwise add and accumulate -+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 -+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 -+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 -+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 -+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0 -+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 -+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 -+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 -+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 -+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 -+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 -+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 -+//Folding maximum vpmax -> takes maximum of adjacent pairs -+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 -+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 -+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 -+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 -+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 
d0,d0,d0 -+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 -+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 -+//Folding minimum vpmin -> takes minimum of adjacent pairs -+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 -+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 -+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 -+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 -+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0 -+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 -+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 -+//Reciprocal/Sqrt -+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 -+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 -+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 -+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 -+//Shifts by signed variable -+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) -+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 -+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 -+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 -+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 -+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 -+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0 -+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 -+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 -+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 -+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 -+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 -+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 -+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 -+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 -+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 -+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 -+//Vector saturating shift left: (negative values shift right) -+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 -+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 -+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 -+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 -+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 -+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0 -+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 -+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 -+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 -+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 -+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 -+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 -+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 -+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 -+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 -+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 -+//Vector rounding shift left: (negative values 
shift right) -+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 -+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 -+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 -+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 -+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 -+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0 -+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 -+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 -+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 -+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 -+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 -+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 -+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 -+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 -+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 -+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 -+//Vector saturating rounding shift left: (negative values shift right) -+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 -+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 -+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 -+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 -+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 -+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0 -+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 -+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 -+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 -+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 -+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 -+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 -+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 -+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 -+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 -+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 -+//Shifts by a constant -+//Vector shift right by constant -+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 -+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 -+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 -+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 -+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 -+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16 -+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 -+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 -+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 -+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 -+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 -+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 -+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // 
VSHR.U8 q0,q0,#8 -+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 -+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 -+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 -+//Vector shift left by constant -+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+//Vector rounding shift right by constant -+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 -+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 -+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 -+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 -+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 -+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16 -+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 -+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 -+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 -+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 -+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 -+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 -+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 -+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 -+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 -+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 -+//Vector shift right by constant and accumulate -+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 -+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 -+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 -+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 -+uint8x8_t 
vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 -+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16 -+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 -+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 -+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 -+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 -+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 -+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 -+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 -+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 -+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 -+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 -+//Vector rounding shift right by constant and accumulate -+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 -+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 -+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 -+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 -+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 -+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16 -+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 -+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 -+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 -+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 -+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 -+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 -+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 -+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 -+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 -+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 -+//Vector saturating shift left by constant -+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 -+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 -+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 -+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 -+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 -+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0 -+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 -+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int 
b); // VQSHL.U64 d0,d0,#0 -+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 -+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 -+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 -+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 -+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 -+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 -+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 -+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 -+//Vector signed->unsigned saturating shift left by constant -+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 -+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 -+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 -+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 -+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 -+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 -+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 -+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 -+//Vector narrowing shift right by constant -+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+//Vector signed->unsigned narrowing saturating shift right by constant -+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 -+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 -+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 -+//Vector signed->unsigned rounding narrowing saturating shift right by constant -+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 -+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 -+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 -+//Vector narrowing saturating shift right by constant -+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 -+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 -+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 -+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8 -+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 -+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 -+//Vector rounding narrowing shift right by constant -+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) 
int b); // VRSHRN.I32 d0,q0,#16 -+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+//Vector rounding narrowing saturating shift right by constant -+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 -+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 -+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 -+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 -+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 -+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 -+//Vector widening shift left by constant -+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 -+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 -+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 -+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 -+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0 -+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 -+//Shifts with insert -+//Vector shift right and insert -+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // 
VSRI.16 q0,q0,#16 -+//Vector shift left and insert -+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 
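For illustration only (not part of the patch): a minimal sketch of how the narrowing saturating shift prototypes above combine with the single-vector load/store prototypes that follow. It assumes a NEON-enabled GCC or Clang ARM toolchain where these intrinsics come from <arm_neon.h>; the buffer contents are made up for the example.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 16-bit intermediates, e.g. filter results; some fall outside 0..255 */
    int16_t src[8] = { -3, 0, 127, 128, 255, 256, 300, 1000 };
    uint8_t dst[8];

    int16x8_t v = vld1q_s16(src);        /* VLD1.16: load 8 signed halfwords   */
    /* VQRSHRUN.S16: shift right by 1 with rounding, then saturate the signed
     * result into the unsigned 8-bit range (the shift count must be 1..8)    */
    uint8x8_t n = vqrshrun_n_s16(v, 1);
    vst1_u8(dst, n);                     /* VST1.8: store 8 bytes              */

    for (int i = 0; i < 8; i++)
        printf("%d -> %u\n", src[i], dst[i]);
    return 0;
}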
-+//Load a single vector from memory -+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] -+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] -+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] -+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] -+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] -+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] -+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] -+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] -+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] -+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] -+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] -+//Load a single lane from memory -+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] -+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0] -+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] -+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0] -+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const 
* ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] -+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] -+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] -+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0] -+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] -+float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] -+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] -+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+//Load all lanes of vector with same value from memory -+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * 
ptr); // VLD1.32 {d0[]}, [r0] -+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+//Store a single vector or lane. Stores all lanes or a single lane of a vector. -+//Store a single vector into memory -+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] -+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] -+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] -+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] -+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] -+//Store a lane of a vector into memory -+//Loads of an N-element structure -+//Load N-element structure from memory -+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] -+float32x4x2_t 
vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] -+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+uint16x8x4_t 
vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+//Load all lanes of N-element structure with same value from memory -+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t 
const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+//Load a single lane of N-element structure from memory -+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned -+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 
-+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] -+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] -+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0] -+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0] -+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] -+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); 
//VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+//Store N-element structure to memory 
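For illustration only (not part of the patch): a short sketch of the N-element structure loads and stores declared around here, assuming the conventional by-value vld2_u8/vst2_u8 spellings from GCC's <arm_neon.h> (the _ptr spellings listed above are the pointer-based variants this header adds for the alignment workaround noted in its comment). The helper name and buffers are hypothetical.

#include <arm_neon.h>
#include <stdint.h>

/* De-interleave 8 stereo byte pairs (L0 R0 L1 R1 ...) into two planes,
 * then re-interleave them unchanged. */
void roundtrip_stereo(const uint8_t interleaved[16], uint8_t out[16],
                      uint8_t left[8], uint8_t right[8])
{
    uint8x8x2_t lr = vld2_u8(interleaved);  /* VLD2.8: splits even/odd bytes */
    vst1_u8(left,  lr.val[0]);              /* plane of left samples         */
    vst1_u8(right, lr.val[1]);              /* plane of right samples        */
    vst2_u8(out, lr);                       /* VST2.8: interleaves back      */
}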
-+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0] -+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0] -+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0] -+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0] -+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0] -+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0] -+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0] -+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0] -+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0] -+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0] -+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0] -+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0] -+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0] -+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0] -+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0] -+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0] -+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] -+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] -+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] -+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] -+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] -+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] -+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] -+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] -+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] -+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] -+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] -+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] -+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] -+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // 
VST3.32 {d0, d1, d2}, [r0] -+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] -+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] -+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] -+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] -+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] -+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] -+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] -+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] -+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] -+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] -+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] -+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] -+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] -+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] -+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] -+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] -+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] -+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] -+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] -+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] -+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] -+//Store a single lane of N-element structure to memory -+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 
-+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] -+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] -+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] -+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] -+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] -+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] -+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void 
vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] -+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // 
VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. -+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] -+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] -+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] -+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] -+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] -+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] -+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] -+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] -+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 
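For illustration only (not part of the patch): a minimal sketch of the lane-extraction prototypes above, assuming a NEON-enabled GCC or Clang toolchain; the data values are made up for the example.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t buf[4] = { 10, 20, 30, 40 };
    uint32x4_t v = vld1q_u32(buf);          /* VLD1.32: load 4 words          */

    /* VMOV.32 r0, d0[x]: move one element to a core register; the lane index
     * must be a compile-time constant in range, as the __constrange notes say */
    uint32_t first = vgetq_lane_u32(v, 0);
    uint32_t last  = vgetq_lane_u32(v, 3);

    printf("%u %u\n", first, last);         /* prints: 10 40 */
    return 0;
}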
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+//Initialize a vector from a literal bit pattern. 
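For illustration only (not part of the patch): a small sketch of the lane-set and bit-pattern-initialization prototypes in this area, again assuming the standard <arm_neon.h> on a NEON-enabled toolchain; the constant and output buffer are made up.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* vcreate_u8 reinterprets a 64-bit literal as 8 byte lanes (lane 0 = low byte) */
    uint8x8_t v = vcreate_u8(0x0706050403020100ULL);

    /* vset_lane_u8 returns a copy of the vector with one lane replaced
     * (VMOV.8 d0[4],r0); the lane index must be a compile-time constant */
    v = vset_lane_u8(0xFF, v, 4);

    uint8_t out[8];
    vst1_u8(out, v);
    for (int i = 0; i < 8; i++)
        printf("%02x ", out[i]);            /* 00 01 02 03 ff 05 06 07 */
    printf("\n");
    return 0;
}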
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 -+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 -+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 -+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 -+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 -+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 -+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 -+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 -+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 -+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 -+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 -+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 -+//Set all lanes to same value -+//Load all lanes of vector to the same literal value -+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 -+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 -+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 -+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 -+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 -+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 -+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 -+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 -+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 -+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 -+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 -+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 -+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 -+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 -+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 -+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 -+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 -+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 -+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 -+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 -+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 -+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 -+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 -+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 -+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 -+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 -+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 -+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 -+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+//Load all lanes of the vector to the value of a lane of a vector -+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+uint32x2_t vdup_lane_u32(uint32x2_t vec, 
__constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. -+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 -+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 -+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 -+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 -+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 -+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 -+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 -+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 -+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 -+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 -+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 -+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 -+//Splitting vectors. 
These intrinsics split a 128 bit vector into 2 component 64 bit vectors -+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 -+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 -+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 -+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 -+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 -+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 -+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 -+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 -+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 -+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 -+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 -+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 -+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 -+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 -+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 -+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 -+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 -+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 -+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 -+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 -+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 -+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 -+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 -+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 -+//Converting vectors. These intrinsics are used to convert vectors. -+//Convert from float -+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 -+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 -+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 -+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 -+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 -+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 -+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 -+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 -+//Convert to float -+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 -+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 -+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 -+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 -+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 -+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 -+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 -+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 -+//Convert between floats -+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 -+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 -+//Vector narrow integer -+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 -+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 -+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 -+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 -+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 -+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 -+//Vector long move -+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 -+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 -+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 -+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 
q0,d0 -+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0 -+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 -+//Vector saturating narrow integer -+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 -+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 -+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 -+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0 -+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0 -+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0 -+//Vector saturating narrow integer signed->unsigned -+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 -+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 -+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 -+//Table look up -+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 -+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+//Extended table look up intrinsics -+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 -+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+//Operations with a scalar value -+//Vector multiply accumulate with scalar -+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] -+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] -+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] -+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] -+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 
d0,d0, d0[0] -+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0] -+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0] -+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0] -+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0] -+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0] -+//Vector widening multiply accumulate with scalar -+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0] -+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0] -+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0] -+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0] -+//Vector widening saturating doubling multiply accumulate with scalar -+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0] -+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0] -+//Vector multiply subtract with scalar -+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] -+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] -+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] -+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] -+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0] -+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0] -+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0] -+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0] -+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0] -+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0] -+//Vector widening multiply subtract with scalar -+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0] -+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0] -+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0] -+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0] -+//Vector widening saturating doubling multiply subtract with scalar -+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0] -+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0] -+//Vector 
multiply by scalar -+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] -+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] -+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] -+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] -+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] -+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] -+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] -+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] -+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] -+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] -+//Vector long multiply with scalar -+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] -+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] -+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0] -+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] -+//Vector long multiply by scalar -+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] -+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] -+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0] -+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] -+//Vector saturating doubling long multiply with scalar -+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] -+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] -+//Vector saturating doubling long multiply by scalar -+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] -+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] -+//Vector saturating doubling multiply high with scalar -+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] -+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] -+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] -+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] -+//Vector saturating doubling multiply high by scalar -+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] -+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] -+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] -+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] -+//Vector saturating rounding doubling multiply high with scalar -+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] -+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] -+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] -+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] -+//Vector rounding saturating doubling multiply high by scalar 
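// A note on the "saturating doubling multiply high" family above and below:
// per lane it is essentially the Q15/Q31 fixed-point multiply. A minimal
// standalone sketch of one signed 16-bit lane of the rounding variant
// (VQRDMULH) is given here for reference; the helper name and main() are
// illustrative only and are not part of this header.
#include <stdint.h>
#include <stdio.h>

// One s16 lane of VQRDMULH: high 16 bits of 2*a*b with rounding, saturating
// the single case that overflows (a == b == INT16_MIN).
static int16_t lane_qrdmulh_s16(int16_t a, int16_t b)
{
    int64_t p = 2 * (int64_t)a * (int64_t)b + (1 << 15); // doubling + rounding bias
    p >>= 16;                                            // keep the high half
    if (p > INT16_MAX) p = INT16_MAX;                    // saturate
    if (p < INT16_MIN) p = INT16_MIN;
    return (int16_t)p;
}

int main(void)
{
    printf("%d\n", lane_qrdmulh_s16(16384, 16384));   // 0.5 * 0.5 in Q15 -> 8192
    printf("%d\n", lane_qrdmulh_s16(-32768, -32768)); // would be 32768, saturates to 32767
    return 0;
}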
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] -+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] -+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] -+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] -+//Vector multiply accumulate with scalar -+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] -+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] -+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] -+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] -+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] -+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] -+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] -+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] -+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] -+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] -+//Vector widening multiply accumulate with scalar -+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] -+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] -+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0] -+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] -+//Vector widening saturating doubling multiply accumulate with scalar -+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] -+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] -+//Vector multiply subtract with scalar -+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] -+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] -+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] -+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] -+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] -+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] -+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] -+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] -+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] -+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] -+//Vector widening multiply subtract with scalar -+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] -+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] -+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0] -+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] -+//Vector widening saturating doubling multiply subtract with scalar -+int32x4_t 
vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] -+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] -+//Vector extract -+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 -+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 -+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 -+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 -+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 -+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 -+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0 -+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 
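// To make the VREVn.m naming above concrete: VREV32.8 reverses the byte
// order inside every 32-bit group. A minimal standalone scalar model of
// vrev32_u8 follows; the helper name and main() are illustrative only and
// operate on a plain byte array instead of the real vector type.
#include <stdint.h>
#include <stdio.h>

// Scalar model of vrev32_u8: reverse the bytes within each 32-bit chunk of
// a 64-bit (8-byte) vector.
static void rev32_u8(uint8_t v[8])
{
    for (int chunk = 0; chunk < 8; chunk += 4) {
        for (int i = 0; i < 2; i++) {
            uint8_t t = v[chunk + i];
            v[chunk + i] = v[chunk + 3 - i];
            v[chunk + 3 - i] = t;
        }
    }
}

int main(void)
{
    uint8_t v[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    rev32_u8(v);
    for (int i = 0; i < 8; i++)
        printf("%d ", v[i]);                  // prints: 3 2 1 0 7 6 5 4
    printf("\n");
    return 0;
}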
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0 -+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0 -+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0 -+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0 -+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0 -+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0 -+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0 -+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0 -+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0 -+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 -+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 -+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 -+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 -+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 -+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 -+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 -+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 -+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 -+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0 -+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0 -+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0 -+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0 -+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0 -+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0 -+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 -+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 -+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 -+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 -+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 -+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0 -+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0 -+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0 -+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0 -+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 -+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 -+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 -+//Other single operand arithmetic -+//Absolute: Vd[i] = |Va[i]| -+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0 -+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0 -+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0 -+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 -+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 -+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 -+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 -+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 -+//Saturating absolute: Vd[i] = sat(|Va[i]|) -+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 -+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 -+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 -+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 -+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 -+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 -+//Negate: Vd[i] = - Va[i] -+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0 -+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0 -+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0 -+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0 -+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 -+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 -+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 -+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 -+//Saturating Negate: sat(Vd[i] = - Va[i]) -+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0 -+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0 
-+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0 -+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 -+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 -+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 -+//Count leading sign bits -+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 -+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 -+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 -+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 -+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 -+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 -+//Count leading zeros -+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0 -+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0 -+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0 -+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0 -+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0 -+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0 -+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 -+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 -+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 -+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 -+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 -+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 -+//Count number of set bits -+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 -+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 -+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 -+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 -+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 -+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 -+//Reciprocal estimate -+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 -+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 -+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 -+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 -+//Reciprocal square root estimate -+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 -+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 -+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 -+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 -+//Logical operations -+//Bitwise not -+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 -+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 -+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 -+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 -+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 -+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 -+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 -+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 -+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 -+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 -+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 -+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 -+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 -+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 -+//Bitwise and -+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 -+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 -+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 -+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 -+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 -+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 -+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 -+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 -+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 -+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 -+int32x4_t vandq_s32(int32x4_t a, int32x4_t 
b); // VAND q0,q0,q0 -+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 -+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 -+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 -+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 -+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 -+//Bitwise or -+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 -+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 -+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 -+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 -+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 -+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 -+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 -+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 -+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 -+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 -+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 -+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 -+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 -+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 -+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 -+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 -+//Bitwise exclusive or (EOR or XOR) -+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 -+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 -+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 -+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 -+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 -+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 -+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 -+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 -+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 -+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 -+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 -+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 -+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 -+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 -+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 -+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 -+//Bit Clear -+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 -+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 -+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 -+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 -+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 -+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 -+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 -+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 -+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 -+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 -+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 -+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 -+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 -+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 -+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 -+//Bitwise OR complement -+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 -+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 -+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 -+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 -+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 -+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 -+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 -+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 -+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 -+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 -+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 -+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 -+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 -+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 -+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 -+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 -+//Bitwise Select -+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 -+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 -+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 -+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 -+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 -+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 -+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 -+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 -+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 -+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 -+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 -+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 -+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 -+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 -+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 -+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 -+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 -+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 -+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 -+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 -+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 -+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 -+//Transposition operations -+//Transpose elements -+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 -+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 -+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 -+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 -+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 -+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 -+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 -+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 -+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 -+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 -+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 -+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 -+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 -+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 -+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 -+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 -+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 -+//Interleave elements -+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 -+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 -+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 -+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 -+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 -+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 -+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 -+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 -+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 -+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 -+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 -+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 -+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 -+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 -+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 -+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 -+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 -+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 -+//De-Interleave elements -+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 -+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 -+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 -+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 -+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 -+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 -+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 -+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 -+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 -+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 -+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 -+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 -+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 -+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 -+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 -+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 -+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 -+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 -+ -+ -+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -+// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
While for release build it is not a must, -+//for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal -+// -+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers. -+ -+ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 -+ -+ #define _MM_EXTRACT_EPI16 _mm_extract_epi16 -+ #define _MM_INSERT_EPI16 _mm_insert_epi16 -+#ifdef USE_SSE4 -+ #define _MM_EXTRACT_EPI8 _mm_extract_epi8 -+ #define _MM_EXTRACT_EPI32 _mm_extract_epi32 -+ #define _MM_EXTRACT_PS _mm_extract_ps -+ -+ #define _MM_INSERT_EPI8 _mm_insert_epi8 -+ #define _MM_INSERT_EPI32 _mm_insert_epi32 -+ #define _MM_INSERT_PS _mm_insert_ps -+#ifdef _NEON2SSE_64BIT -+ #define _MM_INSERT_EPI64 _mm_insert_epi64 -+ #define _MM_EXTRACT_EPI64 _mm_extract_epi64 -+#endif -+#endif //SSE4 -+#else -+ #define _NEON2SSE_COMMA , -+ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ -+ switch(LANE) \ -+ { \ -+ case 0: return NAME(a b, 0); \ -+ case 1: return NAME(a b, 1); \ -+ case 2: return NAME(a b, 2); \ -+ case 3: return NAME(a b, 3); \ -+ case 4: return NAME(a b, 4); \ -+ case 5: return NAME(a b, 5); \ -+ case 6: return NAME(a b, 6); \ -+ case 7: return NAME(a b, 7); \ -+ case 8: return NAME(a b, 8); \ -+ case 9: return NAME(a b, 9); \ -+ case 10: return NAME(a b, 10); \ -+ case 11: return NAME(a b, 11); \ -+ case 12: return NAME(a b, 12); \ -+ case 13: return NAME(a b, 13); \ -+ case 14: return NAME(a b, 14); \ -+ case 15: return NAME(a b, 15); \ -+ default: return NAME(a b, 0); \ -+ } -+ -+ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ -+ switch(LANE) \ -+ { \ -+ case 0: return NAME(vec p,0); \ -+ case 1: return NAME(vec p,1); \ -+ case 2: return NAME(vec p,2); \ -+ case 3: return NAME(vec p,3); \ -+ case 4: return NAME(vec p,4); \ -+ case 5: return NAME(vec p,5); \ -+ case 6: return NAME(vec p,6); \ -+ case 7: return NAME(vec p,7); \ -+ default: return NAME(vec p,0); \ -+ } -+ -+ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ -+ switch(LANE) \ -+ { \ -+ case case0: return NAME(vec p,case0); \ -+ case case1: return NAME(vec p,case1); \ -+ case case2: return NAME(vec p,case2); \ -+ case case3: return NAME(vec p,case3); \ -+ default: return NAME(vec p,case0); \ -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) -+ { -+ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) -+ } -+ -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) -+ { -+ 
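        // The switch expands to one call per possible lane, each with a
        // literal immediate, so these wrappers still compile when LANE is
        // not a compile-time constant (typically unoptimized debug builds);
        // when LANE is a constant the compiler can fold the switch down to
        // the single matching call.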
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) -+ } -+ -+#ifdef _NEON2SSE_64BIT -+ //the special case of functions available only for SSE4 and 64-bit build. -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) -+ { -+ switch(LANE) { -+ case 0: -+ return _mm_insert_epi64(vec, p, 0); -+ case 1: -+ return _mm_insert_epi64(vec, p, 1); -+ default: -+ return _mm_insert_epi64(vec, p, 0); -+ } -+ } -+ -+ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) -+ { -+ if (LANE ==0) return _mm_extract_epi64(val, 0); -+ else return _mm_extract_epi64(val, 1); -+ } -+#endif -+ -+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+#endif //USE_SSE4 -+ -+#endif //#ifdef NDEBUG -+ -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices -+// or for some specific commonly used operations implementation missing in SSE -+#ifdef USE_SSE4 -+ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 -+ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 -+ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 -+ -+ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 -+ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 -+ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 -+ -+ #define _MM_MAX_EPI8 _mm_max_epi8 -+ #define _MM_MAX_EPI32 _mm_max_epi32 -+ #define _MM_MAX_EPU16 _mm_max_epu16 -+ #define _MM_MAX_EPU32 _mm_max_epu32 -+ -+ #define _MM_MIN_EPI8 _mm_min_epi8 -+ #define _MM_MIN_EPI32 _mm_min_epi32 -+ #define _MM_MIN_EPU16 _mm_min_epu16 -+ #define _MM_MIN_EPU32 _mm_min_epu32 -+ -+ #define _MM_BLENDV_EPI8 _mm_blendv_epi8 -+ #define _MM_PACKUS_EPI32 _mm_packus_epi32 -+ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) -+ -+ #define _MM_MULLO_EPI32 _mm_mullo_epi32 -+ #define _MM_MUL_EPI32 _mm_mul_epi32 -+ -+ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64 -+#else //no SSE4 !!!!!! 
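// The fallback branch below emulates the missing SSE4.1 operations with
// SSE2/SSSE3 sequences. For the widening conversions, zero-extension is an
// unpack against zero and sign-extension is an unpack against a lane-wise
// sign mask produced by _mm_cmpgt_epi8/16/32(zero, a). A minimal standalone
// scalar sketch of the signed 8-to-16-bit case follows; the helper name and
// main() are illustrative only.
#include <stdint.h>
#include <stdio.h>

// Scalar model of one lane of the _mm_cvtepi8_epi16 emulation: pair each
// byte with a "sign" byte that is 0xFF when the value is negative (what
// _mm_cmpgt_epi8(zero, a) yields) and 0x00 otherwise, exactly what
// _mm_unpacklo_epi8(a, sign) builds lane by lane.
static int16_t widen_s8_to_s16(int8_t x)
{
    uint8_t sign = (x < 0) ? 0xFFu : 0x00u;
    uint16_t lane = (uint16_t)(uint8_t)x | (uint16_t)(sign << 8);
    return (int16_t)lane;
}

int main(void)
{
    int8_t v[4] = { 5, -5, 127, -128 };
    for (int i = 0; i < 4; i++)
        printf("%4d -> %6d\n", v[i], widen_s8_to_s16(v[i]));
    return 0;
}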
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi8(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi16(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi32(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi8(zero, a); -+ return _mm_unpacklo_epi8(a, sign); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi16(zero, a); -+ return _mm_unpacklo_epi16(a, sign); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi32(zero, a); -+ return _mm_unpacklo_epi32(a, sign); -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t tmp[4]; -+ _mm_store_si128((__m128i*)tmp, vec); -+ return tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int8_t tmp[16]; -+ _mm_store_si128((__m128i*)tmp, vec); -+ return (int)tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t tmp[4]; -+ _mm_store_si128((__m128i*)tmp, _M128i(vec)); -+ return tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = p; -+ mask[LANE] = 0x0; -+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = (int8_t)p; -+ mask[LANE] = 0x0; -+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; -+ __m128 tmp, vec_masked, p_masked; -+ mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it -+ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p -+ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec -+ tmp = _mm_or_ps(vec_masked, p_masked); -+ return tmp; -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi8 (a, b); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi32(a, b); -+ resa = _mm_and_si128 (cmp, a); -+ resb = 
_mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) -+ { -+ __m128i c8000, b_s, a_s, cmp; -+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff -+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 -+ b_s = _mm_sub_epi16 (b, c8000); -+ a_s = _mm_sub_epi16 (a, c8000); -+ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) -+ { -+ __m128i c80000000, b_s, a_s, cmp; -+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 -+ b_s = _mm_sub_epi32 (b, c80000000); -+ a_s = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi8 (b, a); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi32(b, a); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) -+ { -+ __m128i c8000, b_s, a_s, cmp; -+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff -+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 -+ b_s = _mm_sub_epi16 (b, c8000); -+ a_s = _mm_sub_epi16 (a, c8000); -+ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) -+ { -+ __m128i c80000000, b_s, a_s, cmp; -+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 -+ b_s = _mm_sub_epi32 (b, c80000000); -+ a_s = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below -+ { -+ //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
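        // The select is the classic AND/ANDNOT/OR idiom: per bit,
        // result = (mask & b) | (~mask & a), so an all-ones mask byte picks
        // b (mask = 0xff, a = 0x12, b = 0x34 -> 0x34) and an all-zero mask
        // byte picks a (-> 0x12).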
-+ __m128i a_masked, b_masked; -+ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff -+ a_masked = _mm_andnot_si128 (mask,a); -+ return _mm_or_si128(a_masked, b_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) -+ { -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ __m128i a16, b16, res, reshi,cmp, zero; -+ zero = _mm_setzero_si128(); -+ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); -+ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); -+ res = _mm_unpacklo_epi64(a16, b16); //result without saturation -+ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation -+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero -+ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive -+ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) -+ { -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ __m128i a16, res, reshi,cmp, zero; -+ zero = _mm_setzero_si128(); -+ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); -+ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation -+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero -+ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive -+ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff -+ } -+ -+ -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; -+ int64_t res64; -+ int i; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ for (i = 0; i<4; i++) { -+ res64 = atmp[i] * btmp[i]; -+ res[i] = (int)(res64 & 0xffffffff); -+ } -+ return _mm_load_si128((__m128i*)res); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) -+ { -+ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; -+ sign = _mm_xor_si128 (a, b); -+ sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive -+ zero = _mm_setzero_si128(); -+ a_neg = _mm_abs_epi32 (a); //negate a and b -+ b_neg = _mm_abs_epi32 (b); //negate a and b -+ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result -+ mul_us_neg = _mm_sub_epi64(zero, mul_us); -+ mul_us_neg = _mm_and_si128(sign, mul_us_neg); -+ mul_us = _mm_andnot_si128(sign, mul_us); -+ return _mm_or_si128 (mul_us, mul_us_neg); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b) -+ { -+ __m128i res; -+ res = _mm_cmpeq_epi32 (a, b); -+ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data -+ } -+#endif //SSE4 -+ -+//the special case of functions working only for 32 bits, no SSE4 -+_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; -+ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = p; -+ mask[LANE] = 0x0; -+ 
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+} -+ -+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE) -+{ -+ _NEON2SSE_ALIGN_16 int64_t tmp[2]; -+ _mm_store_si128((__m128i*)tmp, val); -+ return tmp[LANE]; -+} -+ -+#ifndef _NEON2SSE_64BIT_SSE4 -+ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32 -+ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32 -+#endif -+ -+int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints -+_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) -+{ -+ //Overflow happens only if a and sum have the opposite signs -+ __m128i c7fffffff, res, res_sat, res_xor_a; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_slli_epi32 (a, 1); // res = a*2 -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+ -+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -+//************************************************************************* -+//************************************************************************* -+//***************** Functions redefinition\implementatin starts here ***** -+//************************************************************************* -+//************************************************************************* -+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -+ -+/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample: -+#ifdef ARM -+#define vector_addq_s32 _mm_add_epi32 -+#else //if we have IA -+#define vector_addq_s32 vadd_s32 -+#endif -+ -+******************************************************************************************** -+Functions below are organised in the following way: -+ -+Each NEON intrinsic function has one of the following options: -+1. its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement -+2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement -+3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition -+4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance -+the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path -+- please consider such functions removal from your code. 
-+*/ -+ -+//*********************************************************************** -+//************************ Vector add ***************************** -+//*********************************************************************** -+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0]; -+ return res64; -+} -+ -+ -+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b) -+{ -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 -+#define vadd_u8 vadd_s8 -+ -+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 -+#define vadd_u16 vadd_s16 -+ -+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 -+#define vadd_u32 vadd_s32 -+ -+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) -+{ -+ uint64x1_t res64; -+ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0]; -+ return res64; -+} -+ -+ -+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 -+#define vaddq_s8 _mm_add_epi8 -+ -+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 -+#define vaddq_s16 _mm_add_epi16 -+ -+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 -+#define vaddq_s32 _mm_add_epi32 -+ -+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 -+#define vaddq_s64 _mm_add_epi64 -+ -+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 -+#define vaddq_f32 _mm_add_ps -+ -+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 -+#define vaddq_u8 _mm_add_epi8 -+ -+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 -+#define vaddq_u16 _mm_add_epi16 -+ -+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 -+#define vaddq_u32 _mm_add_epi32 -+ -+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 -+#define vaddq_u64 _mm_add_epi64 -+ -+//**************************** Vector long add *****************************: -+//*********************************************************************** -+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
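// Per lane, the "long add" below is an ordinary add performed at twice the
// input width, so it cannot wrap; on x86 it becomes a sign/zero extension
// (the _MM_CVTEP*_* helpers above) followed by a packed add. A minimal
// standalone scalar sketch of one signed 8-bit lane; the helper name and
// main() are illustrative only.
#include <stdint.h>
#include <stdio.h>

// One lane of vaddl_s8: widen both operands to 16 bits, then add.
static int16_t lane_addl_s8(int8_t a, int8_t b)
{
    return (int16_t)((int16_t)a + (int16_t)b);
}

int main(void)
{
    printf("%d\n", lane_addl_s8(127, 127));   // 254: no 8-bit wrap-around
    printf("%d\n", lane_addl_s8(-128, -128)); // -256
    return 0;
}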
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_add_epi16 (a16, b16); -+} -+ -+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a32, b32); -+} -+ -+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 ( a64, b64); -+} -+ -+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi16 (a16, b16); -+} -+ -+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a32, b32); -+} -+ -+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a64, b64); -+} -+ -+//*************** Vector wide add: vaddw_. 
Vr[i]:=Va[i]+Vb[i] ****************** -+//*************** ********************************************************************* -+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 -+_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_add_epi16 (a, b16); -+} -+ -+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 -+_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, -+ return _mm_add_epi32 (a, b32); -+} -+ -+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 -+_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a, b64); -+} -+ -+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 -+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi16 (a, b16); -+} -+ -+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0 -+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a, b32); -+} -+ -+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 -+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a, b64); -+} -+ -+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* -+//************************************************************************************************************************* -+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vhaddq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64( vhaddq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64( vhaddq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64( vhaddq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64( vhaddq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64( vhaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 
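-+    //Editorial note (illustration only, not from the upstream sources): the identity uses
-+    //x + y == (x ^ y) + 2*(x & y); halving the already-doubled term exactly gives
-+    //(x + y) >> 1 == (x & y) + ((x ^ y) >> 1) without ever forming the overflowing sum.
-+    //Example: x = 127, y = 3 -> (x & y) = 3, (x ^ y) >> 1 = 62, result 65 == (127 + 3) >> 1.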
-+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = vshrq_n_s8(tmp2,1); -+ return _mm_add_epi8(tmp1,tmp2); -+} -+ -+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srai_epi16(tmp2,1); -+ return _mm_add_epi16(tmp1,tmp2); -+} -+ -+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srai_epi32(tmp2,1); -+ return _mm_add_epi32(tmp1,tmp2); -+} -+ -+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 -+{ -+ __m128i c1, sum, res; -+ c1 = _mm_set1_epi8(1); -+ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it -+ res = _mm_xor_si128(a, b); //for rounding compensation -+ res = _mm_and_si128(res,c1); //for rounding compensation -+ return _mm_sub_epi8 (sum, res); //actual rounding compensation -+} -+ -+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 -+{ -+ __m128i sum, res; -+ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it -+ res = _mm_xor_si128(a, b); //for rounding compensation -+ res = _mm_slli_epi16 (res,15); //shift left then back right to -+ res = _mm_srli_epi16 (res,15); //get 1 or zero -+ return _mm_sub_epi16 (sum, res); //actual rounding compensation -+} -+ -+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srli_epi32(tmp2,1); -+ return _mm_add_epi32(tmp1,tmp2); -+} -+ -+//************************Vector rounding halving add: vrhadd{q}_. Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** -+//***************************************************************************************************************************** -+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 
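-+    //Editorial note (illustration only): _mm_avg_epu8 computes (a + b + 1) >> 1 per lane, which
-+    //is exactly the NEON rounding halving add, e.g. a = 1, b = 2 gives (1 + 2 + 1) >> 1 = 2,
-+    //while the truncating vhadd_u8 above yields 1; no compensation is needed here.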
-+} -+ -+ -+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! -+} -+ -+ -+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 -+{ -+ //no signed average in x86 SIMD, go to unsigned -+ __m128i c128, au, bu, sum; -+ c128 = _mm_set1_epi8(0x80); //-128 -+ au = _mm_sub_epi8(a, c128); //add 128 -+ bu = _mm_sub_epi8(b, c128); //add 128 -+ sum = _mm_avg_epu8(au, bu); -+ return _mm_add_epi8 (sum, c128); //sub 128 -+} -+ -+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 -+{ -+ //no signed average in x86 SIMD, go to unsigned -+ __m128i cx8000, au, bu, sum; -+ cx8000 = _mm_set1_epi16(0x8000); // - 32768 -+ au = _mm_sub_epi16(a, cx8000); //add 32768 -+ bu = _mm_sub_epi16(b, cx8000); //add 32768 -+ sum = _mm_avg_epu16(au, bu); -+ return _mm_add_epi16 (sum, cx8000); //sub 32768 -+} -+ -+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) -+{ -+ //need to avoid overflow -+ __m128i a2, b2, res, sum; -+ a2 = _mm_srai_epi32(a,1); //a2=a/2; -+ b2 = _mm_srai_epi32(b,1); // b2=b/2; -+ res = _mm_or_si128(a,b); //for rounding -+ res = _mm_slli_epi32 (res,31); //shift left then back right to -+ res = _mm_srli_epi32 (res,31); //get 1 or zero -+ sum = _mm_add_epi32(a2,b2); -+ return _mm_add_epi32(sum,res); -+} -+ -+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 -+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded -+ -+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 -+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded -+ -+ -+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 -+{ -+ //need to avoid overflow -+ __m128i a2, b2, res, sum; -+ a2 = _mm_srli_epi32(a,1); //a2=a/2; -+ b2 = _mm_srli_epi32(b,1); // b2=b/2; -+ res = _mm_or_si128(a,b); //for rounding -+ res = _mm_slli_epi32 (res,31); //shift left then back right to -+ res = _mm_srli_epi32 (res,31); //get 1 or zero -+ sum = _mm_add_epi32(a2,b2); -+ return _mm_add_epi32(sum,res); -+} -+ -+//****************** VQADD: Vector saturating add ************************ -+//************************************************************************ -+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vqaddq_s32(_pM128i(a), _pM128i(b))); 
-+} -+ -+ -+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int64x1_t res; -+ uint64_t a64, b64; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 + b64; -+ a64 = (a64 >> 63) + (~_SIGNBIT64); -+ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) { -+ res.m64_u64[0] = a64; -+ } -+ return res; -+} -+ -+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vqaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t a64, b64; -+ uint64x1_t res; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 + b64; -+ if (res.m64_u64[0] < a64) { -+ res.m64_u64[0] = ~(uint64_t)0; -+ } -+ return res; -+} -+ -+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 -+#define vqaddq_s8 _mm_adds_epi8 -+ -+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 -+#define vqaddq_s16 _mm_adds_epi16 -+ -+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b) -+{ -+ //no corresponding x86 SIMD soulution, special tricks are necessary. 
Overflow happens only if a and b have the same sign and sum has the opposite sign -+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_add_epi32(a, b); -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ b_xor_a_ = _mm_xor_si128(b, a); -+ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] + btmp[0]; -+ res[1] = atmp[1] + btmp[1]; -+ -+ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64); -+ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64); -+ -+ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) { -+ res[0] = atmp[0]; -+ } -+ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) { -+ res[1] = atmp[1]; -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 -+#define vqaddq_u8 _mm_adds_epu8 -+ -+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0 -+#define vqaddq_u16 _mm_adds_epu16 -+ -+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b) -+{ -+ __m128i c80000000, cmp, subsum, suba, sum; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ sum = _mm_add_epi32 (a, b); -+ subsum = _mm_sub_epi32 (sum, c80000000); -+ suba = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed -+ return _mm_or_si128 (sum, cmp); //saturation -+} -+ -+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b) -+ { -+ __m128i c80000000, sum, cmp, suba, subsum; -+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); -+ sum = _mm_add_epi64 (a, b); -+ subsum = _mm_sub_epi64 (sum, c80000000); -+ suba = _mm_sub_epi64 (a, c80000000); -+ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!! 
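-+    //Editorial note (illustration only): subtracting 0x8000000000000000 from both values maps
-+    //unsigned order onto signed order, so the signed compare detects a > sum, i.e. wraparound.
-+    //Example: a = 0xFFFFFFFFFFFFFFFF, b = 1 -> sum = 0; suba becomes the signed maximum and
-+    //subsum the signed minimum, cmp is all ones, and the OR below saturates the lane to ~0.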
-+ return _mm_or_si128 (sum, cmp); //saturation -+ } -+#else -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] + btmp[0]; -+ res[1] = atmp[1] + btmp[1]; -+ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; -+ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; -+ return _mm_load_si128((__m128i*)(res)); -+ } -+#endif -+ -+ -+//******************* Vector add high half (truncated) ****************** -+//************************************************************************ -+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum; -+ sum = _mm_add_epi16 (a, b); -+ sum = _mm_srai_epi16 (sum, 8); -+ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only -+ return64(sum); -+} -+ -+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0 -+{ -+ int16x4_t res64; -+ __m128i sum; -+ sum = _mm_add_epi32 (a, b); -+ sum = _mm_srai_epi32(sum, 16); -+ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only -+ return64(sum); -+} -+ -+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b) -+{ -+ int32x2_t res64; -+ __m128i sum; -+ sum = _mm_add_epi64 (a, b); -+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sum); -+} -+ -+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum; -+ sum = _mm_add_epi16 (a, b); -+ sum = _mm_srli_epi16 (sum, 8); -+ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only -+ return64(sum); -+} -+ -+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0 -+{ -+ uint16x4_t res64; -+ __m128i sum; -+ sum = _mm_add_epi32 (a, b); -+ sum = _mm_srli_epi32 (sum, 16); -+ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only -+ return64(sum); -+} -+ -+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 -+#define vaddhn_u64 vaddhn_s64 -+ -+//*********** Vector rounding add high half: vraddhn_ ******************. 
-+//*************************************************************************** -+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sum = _mm_srai_epi16 (sum, 8); //get high half -+ sum = _mm_add_epi16 (sum, mask1); //actual rounding -+ sum = _mm_packs_epi16 (sum, sum); -+ return64(sum); -+} -+ -+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ int16x4_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sum = _mm_srai_epi32 (sum, 16); //get high half -+ sum = _mm_add_epi32 (sum, mask1); //actual rounding -+ sum = _mm_packs_epi32 (sum, sum); -+ return64(sum); -+} -+ -+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ int32x2_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi64 (a, b); -+ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero -+ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding -+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6)); -+ return64(sum); -+} -+ -+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sum = _mm_srai_epi16 (sum, 8); //get high half -+ sum = _mm_add_epi16 (sum, mask1); //actual rounding -+ sum = _mm_packus_epi16 (sum, sum); -+ return64(sum); -+} -+ -+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ uint16x4_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sum = _mm_srai_epi32 (sum, 16); //get high half -+ sum = _mm_add_epi32 (sum, mask1); //actual rounding -+ sum = _MM_PACKUS1_EPI32 (sum); -+ return64(sum); -+} -+ -+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 -+#define vraddhn_u64 vraddhn_s64 -+ -+//********************************************************************************** -+//********* Multiplication ************************************* -+//************************************************************************************** -+ -+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] -+//As we don't go to wider result functions are equal to "multiply low" in x86 -+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0 -+{ -+ // no 8 bit 
simd multiply, need to go to 16 bits in SSE -+ int8x8_t res64; -+ __m128i a128, b128, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits -+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (a128, b128); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only -+ return64(res); -+} -+ -+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 -+#define vmul_s16 vmul_u16 -+ -+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 -+#define vmul_s32 vmul_u32 -+ -+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t tmp; -+ __m64_128 res64; -+ tmp = _mm_mul_ps(_pM128(a),_pM128(b)); -+ _M64f(res64, tmp); //use low 64 bits -+ return res64; -+} -+ -+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits in SSE -+ uint8x8_t res64; -+ __m128i mask, a128, b128, res; -+ mask = _mm_set1_epi16(0xff); -+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); -+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); -+ res = _mm_mullo_epi16 (a128, b128); -+ res = _mm_and_si128(res, mask); //to avoid saturation -+ res = _mm_packus_epi16 (res,res); //use only low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0]; -+ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1]; -+ return res; -+} -+ -+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 -+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b) -+{ -+ //may be optimized -+ poly8x8_t res64; -+ __m128i a64, b64, c1, res, tmp, bmasked; -+ int i; -+ a64 = _pM128i(a); -+ b64 = _pM128i(b); -+ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff.... 
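-+    //Editorial note (illustration only): this implements a carry-less (GF(2)) multiply bit by bit:
-+    //for each bit i of b the loop isolates that bit, multiplies (which equals a << i within the
-+    //byte lane) and combines the partial products with XOR instead of addition.
-+    //Example: 0x03 * 0x03 as polynomials is (x+1)*(x+1) = x^2 + 1 = 0x05, not the integer 0x09.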
-+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b64, c1); //0x1 -+ res = vmulq_u8(a64, bmasked); -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b64, c1); //0x1 -+ tmp = vmulq_u8(a64, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return64 (res); -+} -+ -+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits -+ //solution may be not optimal -+ __m128i a16, b16, r16_1, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (a16, b16); -+ //swap hi and low part of a and b to process the remaining data -+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 -+ -+ r16_2 = _mm_mullo_epi16 (a16, b16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit -+ -+ return _mm_unpacklo_epi64(r16_1, r16_2); -+} -+ -+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 -+#define vmulq_s16 _mm_mullo_epi16 -+ -+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 -+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 -+ -+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 -+#define vmulq_f32 _mm_mul_ps -+ -+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits -+ //solution may be not optimal -+ __m128i maskff, a16, b16, r16_1, r16_2; -+ maskff = _mm_set1_epi16(0xff); -+ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (a16, b16); -+ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation -+ //swap hi and low part of a and b to process the remaining data -+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (a16, b16); -+ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation -+ return _mm_packus_epi16 (r16_1, r16_2); -+} -+ -+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 -+#define vmulq_u16 _mm_mullo_epi16 -+ -+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 -+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 -+ -+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 -+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) -+{ -+ //may be optimized -+ __m128i c1, res, tmp, bmasked; -+ int i; -+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 
-+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b, c1); //0x1 -+ res = vmulq_u8(a, bmasked); -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b, c1); //0x1 -+ tmp = vmulq_u8(a, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return res; -+} -+ -+//************************* Vector long multiply *********************************** -+//**************************************************************************** -+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0 -+{ -+ //no 8 bit simd multiply, need to go to 16 bits -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 -+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit -+} -+ -+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0 -+{ -+ #ifdef USE_SSE4 -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 -+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 -+ #else -+ __m128i low, hi, a128,b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ low = _mm_mullo_epi16(a128,b128); -+ hi = _mm_mulhi_epi16(a128,b128); -+ return _mm_unpacklo_epi16(low,hi); -+ #endif -+} -+ -+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0 -+{ -+ __m128i ab, ba, a128, b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 -+ return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+} -+ -+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0 -+{ -+ //no 8 bit simd multiply, need to go to 16 bits -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 -+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit -+} -+ -+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0 -+{ -+ #ifdef USE_SSE4 -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 -+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 -+ #else -+ __m128i a128,b128,low, hi; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ low = _mm_mullo_epi16(a128,b128); -+ hi = _mm_mulhi_epu16(a128,b128); -+ return _mm_unpacklo_epi16(low,hi); -+ #endif -+} -+ -+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0 -+{ -+ ///may be not optimal compared with serial implementation -+ __m128i ab, ba, a128, b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 -+ return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+} -+ -+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 -+_NEON2SSE_INLINE poly16x8_t 
vmull_p8(poly8x8_t a, poly8x8_t b) -+{ -+ //may be optimized -+ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked; -+ int i; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff.... -+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b128, c1); //0x1 -+ -+ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1 -+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 -+ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b128, c1); //0x1 -+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 -+ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return res; -+} -+ -+//****************Vector saturating doubling long multiply ************************** -+//***************************************************************** -+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b) -+{ -+ //the serial soulution may be faster due to saturation -+ __m128i res; -+ res = vmull_s16(a, b); -+ return vqd_s32(res); -+} -+ -+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //the serial soulution may be faster due to saturation -+ __m128i res; -+ res = vmull_s32(a,b); -+ return vqaddq_s64(res,res); //slow serial function!!!! -+} -+ -+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************ -+//****************************************************************************************** -+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0 -+{ -+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits -+ int8x8_t res64; -+ __m128i b128, c128, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (c128, b128); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); -+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+ -+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0 -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1 -+ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ //fma is coming soon, but right now: -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_mul_ps (_pM128(c), _pM128(b)); -+ res = _mm_add_ps 
(_pM128(a), res); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0 -+{ -+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits -+ uint8x8_t res64; -+ __m128i mask, b128, c128, res; -+ mask = _mm_set1_epi16(0xff); -+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (c128, b128); -+ res = _mm_and_si128(res, mask); //to avoid saturation -+ res = _mm_packus_epi16 (res, res); -+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 -+#define vmla_u16 vmla_s16 -+ -+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 -+#define vmla_u32 vmla_s32 -+ -+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2,r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_add_epi8 (r16_1, a); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_add_epi8(r16_2, a_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_mullo_epi16 (c, b); -+ return _mm_add_epi16 (res, a); -+} -+ -+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 -+ return _mm_add_epi32 (res, a); -+} -+ -+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 -+{ -+ //fma is coming soon, but right now: -+ __m128 res; -+ res = _mm_mul_ps (c, b); -+ return _mm_add_ps (a, res); -+} -+ -+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 
(c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_add_epi8 (r16_1, a); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_add_epi8(r16_2, a_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 -+#define vmlaq_u16 vmlaq_s16 -+ -+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 -+#define vmlaq_u32 vmlaq_s32 -+ -+//********************** Vector widening multiply accumulate (long multiply accumulate): -+// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** -+//******************************************************************************************** -+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0 -+{ -+ int16x8_t res; -+ res = vmull_s8(b, c); -+ return _mm_add_epi16 (res, a); -+} -+ -+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int32x4_t res; -+ res = vmull_s16(b, c); -+ return _mm_add_epi32 (res, a); -+} -+ -+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_s32( b, c); -+ return _mm_add_epi64 (res, a); -+} -+ -+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0 -+{ -+ uint16x8_t res; -+ res = vmull_u8(b, c); -+ return _mm_add_epi16 (res, a); -+} -+ -+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ uint32x4_t res; -+ res = vmull_u16(b, c); -+ return _mm_add_epi32 (res, a); -+} -+ -+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_u32( b,c); -+ return _mm_add_epi64 (res, a); -+} -+ -+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** -+//******************************************************************************************** -+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits -+ int8x8_t res64; -+ 
__m128i res; -+ res64 = vmul_s8(b,c); -+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); -+ return64(res); -+} -+ -+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+ -+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0 -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 -+ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only -+ return64(res); -+} -+ -+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_mul_ps (_pM128(c), _pM128(b)); -+ res = _mm_sub_ps (_pM128(a), res); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits -+ uint8x8_t res64; -+ __m128i res; -+ res64 = vmul_u8(b,c); -+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); -+ return64(res); -+} -+ -+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 -+#define vmls_u16 vmls_s16 -+ -+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 -+#define vmls_u32 vmls_s32 -+ -+ -+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); -+ r16_1 = _mm_sub_epi8 (a, r16_1); -+ //swap hi and low part of a, b, c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_sub_epi8 (a_2, r16_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_mullo_epi16 (c, b); -+ return _mm_sub_epi16 (a, res); -+} -+ -+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 -+ return _mm_sub_epi32 (a, res); -+} -+ -+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, 
float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 -+{ -+ __m128 res; -+ res = _mm_mul_ps (c, b); -+ return _mm_sub_ps (a, res); -+} -+ -+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_sub_epi8 (a, r16_1); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_sub_epi8(a_2, r16_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 -+#define vmlsq_u16 vmlsq_s16 -+ -+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 -+#define vmlsq_u32 vmlsq_s32 -+ -+//******************** Vector multiply subtract long (widening multiply subtract) ************************************ -+//************************************************************************************************************* -+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0 -+{ -+ int16x8_t res; -+ res = vmull_s8(b, c); -+ return _mm_sub_epi16 (a, res); -+} -+ -+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int32x4_t res; -+ res = vmull_s16(b, c); -+ return _mm_sub_epi32 (a, res); -+} -+ -+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_s32( b,c); -+ return _mm_sub_epi64 (a, res); -+} -+ -+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0 -+{ -+ uint16x8_t res; -+ res = vmull_u8(b, c); -+ return _mm_sub_epi16 (a, res); -+} -+ -+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ uint32x4_t res; -+ res = vmull_u16(b, c); -+ return _mm_sub_epi32 (a, res); -+} -+ -+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0 -+{ -+ //may be not optimal 
compared with serial implementation -+ int64x2_t res; -+ res = vmull_u32( b,c); -+ return _mm_sub_epi64 (a, res); -+} -+ -+//****** Vector saturating doubling multiply high ********************** -+//************************************************************************* -+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int16x4_t res; -+ int32_t a32, b32, i; -+ for (i = 0; i<4; i++) { -+ a32 = (int32_t) a.m64_i16[i]; -+ b32 = (int32_t) b.m64_i16[i]; -+ a32 = (a32 * b32) >> 15; -+ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32; -+ } -+ return res; -+} -+ -+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster -+{ -+ //may be not optimal compared with a serial solution -+ int32x2_t res64; -+ __m128i mask; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ int64x2_t mul; -+ mul = vmull_s32(a,b); -+ mul = _mm_slli_epi64(mul,1); //double the result -+ //at this point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+ return64(mul); -+} -+ -+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 -+{ -+ __m128i res, res_lo, mask; -+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; -+ res = _mm_mulhi_epi16 (a, b); -+ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation -+ res_lo = _mm_mullo_epi16 (a, b); -+ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit -+ res = _mm_add_epi16(res, res_lo); //combine results -+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 -+} -+ -+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target -+ __m128i ab, ba, mask, mul, mul1; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 -+ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul = _mm_slli_epi64(mul,1); //double the result -+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 -+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 -+ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul1 = _mm_slli_epi64(mul1,1); //double the result -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mul = _mm_unpacklo_epi64(mul, mul1); -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 
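-+    //Editorial note (illustration only): after doubling, the only lane value needing saturation is
-+    //a == b == 0x80000000, whose doubled high half is again 0x80000000; the compare builds an
-+    //all-ones mask for exactly those lanes and the XOR below flips them to 0x7FFFFFFF, matching
-+    //NEON VQDMULH saturation.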
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+} -+ -+//********* Vector saturating rounding doubling multiply high **************** -+//**************************************************************************** -+//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order -+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //may be not optimal compared with a serial solution -+ int32x2_t res64; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ __m128i res_sat, mask, mask1; -+ int64x2_t mul; -+ mul = vmull_s32(a,b); -+ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding -+ //at this point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+ return64(mul); -+} -+ -+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 -+{ -+ __m128i mask, res; -+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; -+ res = _mm_mulhrs_epi16 (a, b); -+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 -+} -+ -+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target -+ __m128i ab, ba, mask, mul, mul1, mask1; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 -+ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul = _mm_add_epi32 (mul, mask1); //actual rounding -+ -+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 -+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 -+ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding -+ //at this 
point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mul = _mm_unpacklo_epi64(mul, mul1); -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+} -+ -+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** -+//************************************************************************************************************************* -+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0 -+{ -+ //not optimal SIMD soulution, serial may be faster -+ __m128i res32; -+ res32 = vmull_s16(b, c); -+ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1); -+ return vqaddq_s32(res32, a); //saturation -+} -+ -+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res64; -+ res64 = vmull_s32(b,c); -+ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1); -+ return vqaddq_s64(res64, a); //saturation -+} -+ -+//************************************************************************************ -+//****************** Vector subtract *********************************************** -+//************************************************************************************ -+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0]; -+ return res64; -+} -+ -+ -+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0]; -+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1]; -+ return res; -+} -+ -+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 -+#define vsub_u8 vsub_s8 -+ -+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 -+#define vsub_u16 vsub_s16 -+ -+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 -+#define vsub_u32 vsub_s32 -+ -+ -+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_u64[0] = a.m64_u64[0] 
- b.m64_u64[0]; -+ return res64; -+} -+ -+ -+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 -+#define vsubq_s8 _mm_sub_epi8 -+ -+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 -+#define vsubq_s16 _mm_sub_epi16 -+ -+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 -+#define vsubq_s32 _mm_sub_epi32 -+ -+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 -+#define vsubq_s64 _mm_sub_epi64 -+ -+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 -+#define vsubq_f32 _mm_sub_ps -+ -+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 -+#define vsubq_u8 _mm_sub_epi8 -+ -+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 -+#define vsubq_u16 _mm_sub_epi16 -+ -+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 -+#define vsubq_u32 _mm_sub_epi32 -+ -+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 -+#define vsubq_u64 _mm_sub_epi64 -+ -+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** -+//*********************************************************************************** -+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. -+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a16, b16); -+} -+ -+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a32, b32); -+} -+ -+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi64 (a64, b64); -+} -+ -+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a16, b16); -+} -+ -+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a32, b32); -+} -+ -+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi64 (a64, b64); -+} -+ -+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** -+//***************************************************************************************************** -+int16x8_t vsubw_s8(int16x8_t a, 
int8x8_t b); // VSUBW.S8 q0,q0,d0 -+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a, b16); -+} -+ -+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 -+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a, b32); -+} -+ -+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 -+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_sub_epi64 (a, b64); -+} -+ -+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 -+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a, b16); -+} -+ -+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0 -+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a, b32); -+} -+ -+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 -+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_sub_epi64 (a, b64); -+} -+ -+//************************Vector saturating subtract ********************************* -+//************************************************************************************* -+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vqsubq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution -+{ -+ uint64x1_t res; -+ uint64_t a64,b64; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 - b64; -+ -+ a64 = (a64 >> 63) + (~_SIGNBIT64); -+ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) { -+ res.m64_u64[0] = a64; -+ } -+ return res; -+} -+ -+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ 
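-+    /* Editor's note (added comment, not in the original header): this wrapper
-+       follows the pattern used throughout the file for 64-bit "d register"
-+       emulation -- _pM128i() appears to widen each 64-bit operand into the low
-+       half of an __m128i, the 128-bit q-form intrinsic does the work, and
-+       return64() copies the low 64 bits of the result into res64. */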
return64(vqsubq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint64x1_t res; -+ uint64_t a64, b64; -+ a64 = _Ui64(a); -+ b64 = _Ui64(b); -+ if (a64 > b64) { -+ res.m64_u64[0] = a64 - b64; -+ } else { -+ res.m64_u64[0] = 0; -+ } -+ return res; -+} -+ -+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 -+#define vqsubq_s8 _mm_subs_epi8 -+ -+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 -+#define vqsubq_s16 _mm_subs_epi16 -+ -+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b) -+{ -+ //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a -+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_sub_epi32(a, b); -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ b_xor_a = _mm_xor_si128(b, a); -+ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] - btmp[0]; -+ res[1] = atmp[1] - btmp[1]; -+ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) { -+ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64; -+ } -+ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) { -+ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64; -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 -+#define vqsubq_u8 _mm_subs_epu8 -+ -+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0 -+#define vqsubq_u16 _mm_subs_epu16 -+ -+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0 -+{ -+ __m128i min, mask, sub; -+ min = _MM_MIN_EPU32(a, b); //SSE4.1 -+ mask = _mm_cmpeq_epi32 (min, b); -+ sub = _mm_sub_epi32 (a, b); -+ return _mm_and_si128 ( sub, mask); -+} -+ -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b) -+ { -+ __m128i c80000000, subb, suba, cmp, sub; -+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); -+ sub = _mm_sub_epi64 (a, b); -+ suba = _mm_sub_epi64 (a, c80000000); -+ subb = _mm_sub_epi64 (b, c80000000); -+ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!! 
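-+        /* Editor's note (added comment, not in the original header): subtracting
-+           the bias 0x8000000000000000 maps unsigned values onto the signed range
-+           while preserving their order, so the signed _mm_cmpgt_epi64 above
-+           yields an unsigned a > b mask.  Worked example: a = 0xFFFFFFFFFFFFFFFF
-+           and b = 1 give a - bias = INT64_MAX and b - bias = INT64_MIN + 1, and
-+           INT64_MAX > INT64_MIN + 1, matching the unsigned comparison. */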
-+ return _mm_and_si128 (sub, cmp); //saturation -+ } -+#else -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; -+ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0; -+ return _mm_load_si128((__m128i*)(res)); -+ } -+#endif -+ -+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** -+//**************************************************************** -+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0 -+{ -+ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit, -+ int8x8_t res64; -+ __m128i r16; -+ int8x8_t r; -+ r = vsub_s8 (a, b); -+ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 -+ r16 = _mm_srai_epi16 (r16, 1); //SSE2 -+ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits -+ return64(r16); -+} -+ -+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vhsubq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+ -+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vhsubq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vhsubq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vhsubq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vhsubq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 -+{ -+ // //need to deal with the possibility of internal overflow -+ __m128i c128, au,bu; -+ c128 = _mm_set1_epi8 (128); -+ au = _mm_add_epi8( a, c128); -+ bu = _mm_add_epi8( b, c128); -+ return vhsubq_u8(au,bu); -+} -+ -+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i c8000, au,bu; -+ c8000 = _mm_set1_epi16(0x8000); -+ au = _mm_add_epi16( a, c8000); -+ bu = _mm_add_epi16( b, c8000); -+ return vhsubq_u16(au,bu); -+} -+ -+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i a2, b2,r, b_1; -+ a2 = _mm_srai_epi32 (a,1); -+ b2 = _mm_srai_epi32 (b,1); -+ r = _mm_sub_epi32 (a2, b2); -+ b_1 = _mm_andnot_si128(a, b); //!a and b -+ b_1 = _mm_slli_epi32 (b_1,31); -+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit -+ return _mm_sub_epi32(r,b_1); -+} -+ 
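-+/* Editor's note (illustrative sketch, not part of the original header; the
-+   helper name is invented for this example): the signed vhsub/vhsubq variants
-+   above compute (Va[i] - Vb[i]) >> 1 without letting the intermediate
-+   difference overflow.  A scalar reference for one signed 32-bit lane, using
-+   a wider intermediate, would be: */
-+static inline int32_t hsub_s32_scalar_ref(int32_t a, int32_t b)
-+{
-+    /* widen, subtract, then arithmetic-shift right by one */
-+    return (int32_t)(((int64_t)a - (int64_t)b) >> 1);
-+}
-+/* vhsubq_s32 above reaches the same result with no widening via
-+   (a >> 1) - (b >> 1) - ((~a & b) & 1). */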
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 -+{ -+ __m128i avg; -+ avg = _mm_avg_epu8 (a, b); -+ return _mm_sub_epi8(a, avg); -+} -+ -+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 -+{ -+ __m128i avg; -+ avg = _mm_avg_epu16 (a, b); -+ return _mm_sub_epi16(a, avg); -+} -+ -+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i a2, b2,r, b_1; -+ a2 = _mm_srli_epi32 (a,1); -+ b2 = _mm_srli_epi32 (b,1); -+ r = _mm_sub_epi32 (a2, b2); -+ b_1 = _mm_andnot_si128(a, b); //!a and b -+ b_1 = _mm_slli_epi32 (b_1,31); -+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit -+ return _mm_sub_epi32(r,b_1); -+} -+ -+//******* Vector subtract high half (truncated) ** ************ -+//************************************************************ -+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum, sum8; -+ sum = _mm_sub_epi16 (a, b); -+ sum8 = _mm_srai_epi16 (sum, 8); -+ sum8 = _mm_packs_epi16(sum8,sum8); -+ return64(sum8); -+} -+ -+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0 -+{ -+ int16x4_t res64; -+ __m128i sum, sum16; -+ sum = _mm_sub_epi32 (a, b); -+ sum16 = _mm_srai_epi32 (sum, 16); -+ sum16 = _mm_packs_epi32(sum16,sum16); -+ return64(sum16); -+} -+ -+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b) -+{ -+ int32x2_t res64; -+ __m128i sub; -+ sub = _mm_sub_epi64 (a, b); -+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sub); -+} -+ -+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum, sum8; -+ sum = _mm_sub_epi16 (a, b); -+ sum8 = _mm_srli_epi16 (sum, 8); -+ sum8 = _mm_packus_epi16(sum8,sum8); -+ return64(sum8); -+} -+ -+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0 -+{ -+ uint16x4_t res64; -+ __m128i sum, sum16; -+ sum = _mm_sub_epi32 (a, b); -+ sum16 = _mm_srli_epi32 (sum, 16); -+ sum16 = _MM_PACKUS1_EPI32(sum16); -+ return64(sum16); -+} -+ -+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 -+#define vsubhn_u64 vsubhn_s64 -+ -+//************ Vector rounding subtract high half ********************* -+//********************************************************************* -+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sub = _mm_srai_epi16 (sub, 8); //get high half -+ sub = _mm_add_epi16 (sub, mask1); 
//actual rounding -+ sub = _mm_packs_epi16 (sub, sub); -+ return64(sub); -+} -+ -+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ int16x4_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sub = _mm_srai_epi32 (sub, 16); //get high half -+ sub = _mm_add_epi32 (sub, mask1); //actual rounding -+ sub = _mm_packs_epi32 (sub, sub); -+ return64(sub); -+} -+ -+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ int32x2_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi64 (a, b); -+ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero -+ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding -+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sub); -+} -+ -+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sub = _mm_srai_epi16 (sub, 8); //get high half -+ sub = _mm_add_epi16 (sub, mask1); //actual rounding -+ sub = _mm_packus_epi16 (sub, sub); -+ return64(sub); -+} -+ -+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ uint16x4_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sub = _mm_srai_epi32 (sub, 16); //get high half -+ sub = _mm_add_epi32 (sub, mask1); //actual rounding -+ sub = _MM_PACKUS1_EPI32 (sub); -+ return64(sub); -+} -+ -+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+#define vrsubhn_u64 vrsubhn_s64 -+ -+//*********** Vector saturating doubling multiply subtract long ******************** -+//************************************************************************************ -+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) -+{ -+ //not optimal SIMD soulution, serial may be faster -+ __m128i res32, mask; -+ int32x4_t res; -+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ res = vmull_s16(b, c); -+ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered -+ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask); -+ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000 -+ return vqsubq_s32(a, res32); //saturation -+} -+ -+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ 
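-+    /* Editor's note (added comment, not in the original header): same scheme as
-+       vqdmlsl_s16 above, widened to 64 bits.  The widening multiply cannot
-+       overflow; doubling overflows only when the product is 0x4000000000000000
-+       (b == c == INT32_MIN), in which case the compare/xor below rewrites the
-+       doubled value 0x8000000000000000 as 0x7FFFFFFFFFFFFFFF, and vqsubq_s64
-+       supplies the final saturating subtraction from a. */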
__m128i res64, mask; -+ int64x2_t res; -+ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000}; -+ res = vmull_s32(b, c); -+ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered -+ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask); -+ res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000 -+ return vqsubq_s64(a, res64); //saturation -+} -+ -+//****************** COMPARISON *************************************** -+//******************* Vector compare equal ************************************* -+//**************************************************************************** -+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) ); -+ return64f(res); -+} -+ -+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 -+#define vceq_p8 vceq_u8 -+ -+ -+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_s8 _mm_cmpeq_epi8 -+ -+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 -+#define vceqq_s16 _mm_cmpeq_epi16 -+ -+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 -+#define vceqq_s32 _mm_cmpeq_epi32 -+ -+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpeq_ps(a,b); -+ return _M128i(res); -+} -+ -+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_u8 _mm_cmpeq_epi8 -+ -+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 -+#define vceqq_u16 _mm_cmpeq_epi16 -+ -+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 -+#define vceqq_u32 _mm_cmpeq_epi32 -+ -+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_p8 _mm_cmpeq_epi8 -+ -+//******************Vector compare greater-than or equal************************* -+//******************************************************************************* -+//in IA SIMD no greater-than-or-equal 
comparison for integers, -+// there is greater-than available only, so we need the following tricks -+ -+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vcgeq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vcgeq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vcgeq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries -+ return64f(res); -+} -+ -+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vcgeq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vcgeq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b) -+{ -+ //serial solution looks faster -+ uint32x2_t res64; -+ return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); -+} -+ -+ -+ -+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi8 ( a, b); -+ m2 = _mm_cmpeq_epi8 ( a, b); -+ return _mm_or_si128 ( m1, m2); -+} -+ -+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi16 ( a, b); -+ m2 = _mm_cmpeq_epi16 ( a, b); -+ return _mm_or_si128 ( m1,m2); -+} -+ -+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi32 (a, b); -+ m2 = _mm_cmpeq_epi32 (a, b); -+ return _mm_or_si128 (m1, m2); -+} -+ -+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpge_ps(a,b); //use only 2 first entries -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 -+{ -+ //no unsigned chars comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu8(a, b); -+ return _mm_cmpeq_epi8(cmp, a); //a>=b -+ #else -+ __m128i c128, as, bs, m1, m2; -+ c128 = _mm_set1_epi8 (128); -+ as = _mm_sub_epi8( a, c128); -+ bs = _mm_sub_epi8( b, c128); -+ m1 = _mm_cmpgt_epi8( as, bs); -+ m2 = _mm_cmpeq_epi8 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t 
vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 -+{ -+ //no unsigned shorts comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu16(a, b); -+ return _mm_cmpeq_epi16(cmp, a); //a>=b -+ #else -+ __m128i c8000, as, bs, m1, m2; -+ c8000 = _mm_set1_epi16 (0x8000); -+ as = _mm_sub_epi16(a,c8000); -+ bs = _mm_sub_epi16(b,c8000); -+ m1 = _mm_cmpgt_epi16(as, bs); -+ m2 = _mm_cmpeq_epi16 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 -+{ -+ //no unsigned ints comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu32(a, b); -+ return _mm_cmpeq_epi32(cmp, a); //a>=b -+ #else -+ //serial solution may be faster -+ __m128i c80000000, as, bs, m1, m2; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ as = _mm_sub_epi32(a,c80000000); -+ bs = _mm_sub_epi32(b,c80000000); -+ m1 = _mm_cmpgt_epi32 (as, bs); -+ m2 = _mm_cmpeq_epi32 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+//**********************Vector compare less-than or equal****************************** -+//*************************************************************************************** -+//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks -+ -+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vcleq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vcleq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vcleq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0? -+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmple_ps(_pM128(a),_pM128(b)); -+ return64f(res); -+} -+ -+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+#define vcle_u8(a,b) vcge_u8(b,a) -+ -+ -+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 -+#define vcle_u16(a,b) vcge_u16(b,a) -+ -+ -+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+#define vcle_u32(a,b) vcge_u32(b,a) -+ -+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... -+ res = _mm_cmpgt_epi8 ( a, b); -+ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal -+} -+ -+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 
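-+    /* Editor's note (added comment, not in the original header): comparing a
-+       register with itself is the usual way to materialize an all-ones mask
-+       without loading a constant; the _mm_andnot_si128 below then inverts the
-+       a > b mask, producing the required a <= b result. */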
-+ res = _mm_cmpgt_epi16 ( a, b); -+ return _mm_andnot_si128 (res, c1); -+} -+ -+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... -+ res = _mm_cmpgt_epi32 ( a, b); -+ return _mm_andnot_si128 (res, c1); -+} -+ -+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmple_ps(a,b); -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 -+ { -+ //no unsigned chars comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu8(a, b); -+ return _mm_cmpeq_epi8(cmp, a); //a<=b -+ } -+#else -+ #define vcleq_u8(a,b) vcgeq_u8(b,a) -+#endif -+ -+ -+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 -+ { -+ //no unsigned shorts comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu16(a, b); -+ return _mm_cmpeq_epi16(cmp, a); //a<=b -+ } -+#else -+ #define vcleq_u16(a,b) vcgeq_u16(b,a) -+#endif -+ -+ -+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 -+ { -+ //no unsigned chars comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu32(a, b); -+ return _mm_cmpeq_epi32(cmp, a); //a<=b -+ } -+#else -+//solution may be not optimal compared with the serial one -+ #define vcleq_u32(a,b) vcgeq_u32(b,a) -+#endif -+ -+ -+//****** Vector compare greater-than ****************************************** -+//************************************************************************** -+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries -+ return64f(res); -+} -+ -+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vcgtq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vcgtq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); 
// VCGT.U32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vcgtq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+#define vcgtq_s8 _mm_cmpgt_epi8 -+ -+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+#define vcgtq_s16 _mm_cmpgt_epi16 -+ -+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+#define vcgtq_s32 _mm_cmpgt_epi32 -+ -+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpgt_ps(a,b); //use only 2 first entries -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 -+{ -+ //no unsigned chars comparison, only signed available,so need the trick -+ __m128i c128, as, bs; -+ c128 = _mm_set1_epi8 (128); -+ as = _mm_sub_epi8(a,c128); -+ bs = _mm_sub_epi8(b,c128); -+ return _mm_cmpgt_epi8 (as, bs); -+} -+ -+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 -+{ -+ //no unsigned short comparison, only signed available,so need the trick -+ __m128i c8000, as, bs; -+ c8000 = _mm_set1_epi16 (0x8000); -+ as = _mm_sub_epi16(a,c8000); -+ bs = _mm_sub_epi16(b,c8000); -+ return _mm_cmpgt_epi16 ( as, bs); -+} -+ -+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0 -+{ -+ //no unsigned int comparison, only signed available,so need the trick -+ __m128i c80000000, as, bs; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ as = _mm_sub_epi32(a,c80000000); -+ bs = _mm_sub_epi32(b,c80000000); -+ return _mm_cmpgt_epi32 ( as, bs); -+} -+ -+//********************* Vector compare less-than ************************** -+//************************************************************************* -+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!! -+ -+ -+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!! -+ -+ -+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!! -+ -+ -+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!! -+ -+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!! -+ -+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 -+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!! -+ -+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!! -+ -+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! -+ -+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! -+ -+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 
-+ -+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! -+ -+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! -+ -+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 -+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! -+ -+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! -+ -+//*****************Vector compare absolute greater-than or equal ************ -+//*************************************************************************** -+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmpge_ps ( a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmpge_ps ( a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//********Vector compare absolute less-than or equal ****************** -+//******************************************************************** -+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmple_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmple_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//******** Vector compare absolute greater-than ****************** -+//****************************************************************** -+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmpgt_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmpgt_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ 
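-+/* Editor's note (illustrative sketch, not part of the original header; the
-+   helper name is invented for this example): the vcage/vcagt family above
-+   clears the IEEE-754 sign bit with a 0x7fffffff mask and then does an
-+   ordinary float comparison, which for non-NaN inputs is equivalent to
-+   comparing fabsf(a) with fabsf(b).  A scalar reference for one lane: */
-+static inline unsigned int cagt_f32_scalar_ref(float a, float b)
-+{
-+    union { float f; unsigned int u; } ua, ub;
-+    ua.f = a; ub.f = b;
-+    ua.u &= 0x7fffffffu;                      /* |a| as a bit pattern */
-+    ub.u &= 0x7fffffffu;                      /* |b| as a bit pattern */
-+    return (ua.f > ub.f) ? 0xffffffffu : 0u;  /* NEON-style all-ones/zero mask */
-+}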
-+//***************Vector compare absolute less-than *********************** -+//************************************************************************* -+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmplt_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmplt_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//*************************Vector test bits************************************ -+//***************************************************************************** -+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them -+with the corresponding element of a second vector. If the result is not zero, the -+corresponding element in the destination vector is set to all ones. Otherwise, it is set to -+all zeros. */ -+ -+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vtstq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vtstq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vtstq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 -+#define vtst_u8 vtst_s8 -+ -+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 -+#define vtst_u16 vtst_s16 -+ -+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 -+#define vtst_u32 vtst_s32 -+ -+ -+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 -+#define vtst_p8 vtst_u8 -+ -+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi8 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi16 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); 
//0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi32 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 -+#define vtstq_u8 vtstq_s8 -+ -+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 -+#define vtstq_u16 vtstq_s16 -+ -+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 -+#define vtstq_u32 vtstq_s32 -+ -+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 -+#define vtstq_p8 vtstq_u8 -+ -+//****************** Absolute difference ******************** -+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** -+//************************************************************ -+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vabdq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vabdq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vabdq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vabdq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vabdq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vabdq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = vabdq_f32(_pM128(a), _pM128(b)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi8 (a, b); -+ return _mm_abs_epi8 (res); -+} -+ -+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi16 (a,b); -+ return _mm_abs_epi16 (res); -+} -+ -+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi32 (a,b); -+ return _mm_abs_epi32 (res); -+} -+ -+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u8(a,b); -+ difab = _mm_sub_epi8(a,b); -+ difba = _mm_sub_epi8 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t 
vabdq_u16(uint16x8_t a, uint16x8_t b) -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u16(a,b); -+ difab = _mm_sub_epi16(a,b); -+ difba = _mm_sub_epi16 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u32(a,b); -+ difab = _mm_sub_epi32(a,b); -+ difba = _mm_sub_epi32 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 -+{ -+ __m128i c1; -+ __m128 res; -+ c1 = _mm_set1_epi32(0x7fffffff); -+ res = _mm_sub_ps (a, b); -+ return _mm_and_ps (res, *(__m128*)&c1); -+} -+ -+//************ Absolute difference - long ************************** -+//******************************************************************** -+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return vabdq_s16(a16, b16); -+ -+} -+ -+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return vabdq_s32(a32, b32); -+} -+ -+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no optimal SIMD solution, serial looks faster -+ _NEON2SSE_ALIGN_16 int64_t res[2]; -+ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0]; -+ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0]; -+ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1]; -+ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) -+{ -+ __m128i res; -+ res = vsubl_u8(a,b); -+ return _mm_abs_epi16(res); -+} -+ -+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) -+{ -+ __m128i res; -+ res = vsubl_u16(a,b); -+ return _mm_abs_epi32(res); -+} -+ -+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0]; -+ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0]; -+ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1]; -+ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//**********Absolute difference and 
accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* -+//********************************************************************************************* -+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) -+{ -+ int8x8_t res64; -+ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) -+{ -+ int32x2_t res64; -+ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 -+#define vaba_u8 vaba_s8 -+ -+ -+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0 -+#define vaba_u16 vaba_s16 -+ -+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) -+{ -+ uint32x2_t res64; -+ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 -+{ -+ int8x16_t sub; -+ sub = vabdq_s8(b, c); -+ return vaddq_s8( a, sub); -+} -+ -+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 -+{ -+ int16x8_t sub; -+ sub = vabdq_s16(b, c); -+ return vaddq_s16( a, sub); -+} -+ -+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 -+{ -+ int32x4_t sub; -+ sub = vabdq_s32(b, c); -+ return vaddq_s32( a, sub); -+} -+ -+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) -+{ -+ uint8x16_t sub; -+ sub = vabdq_u8(b, c); -+ return vaddq_u8( a, sub); -+} -+ -+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) -+{ -+ uint16x8_t sub; -+ sub = vabdq_u16(b, c); -+ return vaddq_u16( a, sub); -+} -+ -+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) -+{ -+ uint32x4_t sub; -+ sub = vabdq_u32(b, c); -+ return vaddq_u32( a, sub); -+} -+ -+//************** Absolute difference and accumulate - long ******************************** -+//************************************************************************************* -+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0 -+{ -+ __m128i b16, c16, res; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, -+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); -+ return _mm_add_epi16 (a, res); -+} -+ -+int32x4_t vabal_s16(int32x4_t 
a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0 -+{ -+ __m128i b32, c32, res; -+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 -+ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 -+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); -+ return _mm_add_epi32 (a, res); -+} -+ -+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res; -+ res = vabdl_s32(b,c); -+ return _mm_add_epi64(a, res); -+} -+ -+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ __m128i b16, c16, res; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, -+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); -+ return _mm_add_epi16 (a, res); -+} -+ -+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) -+{ -+ __m128i b32, c32, res; -+ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 -+ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 -+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); -+ return _mm_add_epi32 (a, res); -+} -+ -+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res; -+ res = vabdl_u32(b,c); -+ return _mm_add_epi64(a, res); -+} -+ -+//*********************************************************************************** -+//**************** Maximum and minimum operations ********************************** -+//*********************************************************************************** -+//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ******* -+//*********************************************************************************** -+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i res; -+ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial -+ return64(res); -+} -+ -+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; -+ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; -+ return res; -+} -+ -+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 -+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 -+ -+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 -+#define vmaxq_s16 _mm_max_epi16 -+ -+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 -+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 -+ -+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 -+#define vmaxq_u8 _mm_max_epu8 -+ -+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 -+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 -+ -+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 -+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 -+ -+ -+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 -+#define vmaxq_f32 _mm_max_ps -+ -+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** -+//*********************************************************************************************************** -+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i res; -+ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial -+ return64(res); -+} -+ -+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; -+ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; -+ return res; -+} -+ -+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 -+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1 -+ -+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 -+#define vminq_s16 _mm_min_epi16 -+ -+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 -+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1 -+ -+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 -+#define vminq_u8 _mm_min_epu8 -+ -+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 -+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1 -+ -+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 -+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1 -+ -+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 -+#define vminq_f32 _mm_min_ps -+ -+//************* Pairwise addition operations. 
************************************** -+//************************************************************************************ -+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector -+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit and then pack -+ int8x8_t res64; -+ __m128i a16, b16, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 -+ res = _mm_hadd_epi16 (a16, b16); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits -+ return64(res); -+} -+ -+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ __m128i hadd128; -+ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); -+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(hadd128); -+} -+ -+ -+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i hadd128; -+ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); -+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(hadd128); -+} -+ -+ -+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0 -+{ -+ // no 8 bit hadd in IA32, need to go to 16 bit and then pack -+ uint8x8_t res64; -+// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work -+ __m128i mask8, a16, b16, res; -+ mask8 = _mm_set1_epi16(0xff); -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 -+ res = _mm_hadd_epi16 (a16, b16); -+ res = _mm_and_si128(res, mask8); //to avoid saturation -+ res = _mm_packus_epi16 (res,res); //use low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0 -+{ -+ // solution may be not optimal, serial execution may be faster -+ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed -+ uint16x4_t res64; -+ __m128i c32767, cfffe, as, bs, res; -+ c32767 = _mm_set1_epi16 (32767); -+ cfffe = _mm_set1_epi16 (0xfffe); -+ as = _mm_sub_epi16 (_pM128i(a), c32767); -+ bs = _mm_sub_epi16 (_pM128i(b), c32767); -+ res = _mm_hadd_epi16 (as, bs); -+ res = _mm_add_epi16 (res, cfffe); -+ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(res); -+} -+ -+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster -+{ -+ //hadd doesn't work for unsigned values -+ uint32x2_t res64; -+ __m128i ab, ab_sh, res; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 -+ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0 -+ res = _mm_add_epi32(ab, ab_sh); -+ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(res); -+} -+ -+float32x2_t vpadd_f32(float32x2_t a, float32x2_t 
b); // VPADD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b) -+{ -+ __m128 hadd128; -+ __m64_128 res64; -+ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b)); -+ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits -+ _M64f(res64, hadd128); -+ return res64; -+} -+ -+ -+//************************** Long pairwise add ********************************** -+//********************************************************************************* -+//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, -+// and places the final results in the destination vector. -+ -+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 -+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit anyway -+ __m128i a16; -+ int16x4_t res64; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits -+ return64(a16); -+} -+ -+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 -+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0 -+{ -+ // solution may be not optimal, serial execution may be faster -+ int32x2_t res64; -+ __m128i r32_1; -+ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); -+ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits -+ return64(r32_1); -+} -+ -+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1]; -+ return res; -+} -+ -+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0 -+{ -+ // no 8 bit hadd in IA32, need to go to 16 bit -+// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work -+ uint16x4_t res64; -+ __m128i a16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits -+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits -+ return64(a16); -+} -+ -+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than a SIMD one -+ uint32x2_t res; -+ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1]; -+ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3]; -+ return res; -+} -+ -+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1]; -+ return res; -+} -+ -+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 -+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i r16_1, r16_2; -+ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 -+ //swap hi and low part of r to process the remaining data -+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r16_2 = _MM_CVTEPI8_EPI16 (r16_2); -+ return _mm_hadd_epi16 (r16_1, r16_2); -+} -+ -+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 -+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i 
r32_1, r32_2; -+ r32_1 = _MM_CVTEPI16_EPI32(a); -+ //swap hi and low part of r to process the remaining data -+ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r32_2 = _MM_CVTEPI16_EPI32 (r32_2); -+ return _mm_hadd_epi32 (r32_1, r32_2); -+} -+ -+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 -+{ -+ _NEON2SSE_ALIGN_16 int32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 int64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; -+ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 -+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i r16_1, r16_2; -+ r16_1 = _MM_CVTEPU8_EPI16(a); -+ //swap hi and low part of r to process the remaining data -+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r16_2 = _MM_CVTEPU8_EPI16 (r16_2); -+ return _mm_hadd_epi16 (r16_1, r16_2); -+} -+ -+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than a SIMD one -+ _NEON2SSE_ALIGN_16 uint16_t atmp[8]; -+ _NEON2SSE_ALIGN_16 uint32_t res[4]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; -+ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; -+ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; -+ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; -+ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//************************ Long pairwise add and accumulate ************************** -+//**************************************************************************************** -+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, -+// and accumulates the values of the results into the elements of the destination (wide) vector -+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 -+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) -+{ -+ int16x4_t res64; -+ return64(vpadalq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 -+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) -+{ -+ int32x2_t res64; -+ return64(vpadalq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 -+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0]; -+ return res; -+} -+ -+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) -+{ -+ uint16x4_t res64; -+ return64(vpadalq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 
d0,d0 -+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) -+{ -+ uint32x2_t res64; -+ return64(vpadalq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 -+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0]; -+ return res; -+} -+ -+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 -+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 -+{ -+ int16x8_t pad; -+ pad = vpaddlq_s8(b); -+ return _mm_add_epi16 (a, pad); -+} -+ -+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 -+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 -+{ -+ int32x4_t pad; -+ pad = vpaddlq_s16(b); -+ return _mm_add_epi32(a, pad); -+} -+ -+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 -+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) -+{ -+ int64x2_t pad; -+ pad = vpaddlq_s32(b); -+ return _mm_add_epi64 (a, pad); -+} -+ -+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 -+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 -+{ -+ uint16x8_t pad; -+ pad = vpaddlq_u8(b); -+ return _mm_add_epi16 (a, pad); -+} -+ -+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x4_t pad; -+ pad = vpaddlq_u16(b); -+ return _mm_add_epi32(a, pad); -+} //no optimal SIMD solution, serial is faster -+ -+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no optimal SIMD solution, serial is faster -+ uint64x2_t pad; -+ pad = vpaddlq_u32(b); -+ return _mm_add_epi64(a, pad); -+} //no optimal SIMD solution, serial is faster -+ -+//********** Folding maximum ************************************* -+//******************************************************************* -+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors, -+//and copies the larger of each pair into the corresponding element in the destination -+// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison -+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0 -+{ -+ int8x8_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1 -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(max); //we need 64 bits only -+} -+ -+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ int16x4_t res64; -+ __m128i ab, ab1, max; -+ 
_NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ max = _mm_max_epi16 (ab, ab1); -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(max); -+} -+ -+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ int32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} -+ -+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0 -+{ -+ uint8x8_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ max = _mm_max_epu8 (ab, ab1); // SSE4.1 -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(max); -+} -+ -+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ uint16x4_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ max = _MM_MAX_EPU16 (ab, ab1); -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(max); -+} -+ -+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ uint32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} //serial solution looks faster than a SIMD one -+ -+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; -+ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; -+ return res; -+} -+ -+// ***************** Folding minimum **************************** -+// ************************************************************** -+//vpmin -> takes minimum of adjacent pairs -+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0 -+{ -+ int8x8_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding -+ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1 -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(min); -+} -+ -+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ int16x4_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ min = _mm_min_epi16 (ab, ab1); -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(min); -+} -+ -+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ int32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} -+ -+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0 -+{ -+ uint8x8_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ min = _mm_min_epu8 (ab, ab1); // SSE4.1 -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(min); -+} -+ -+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ uint16x4_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask -+ min = _MM_MIN_EPU16 (ab, ab1); -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(min); -+} -+ -+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ uint32x2_t res; -+ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; -+ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; -+ return res; -+} -+ -+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; -+ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; -+ return res; -+} -+ -+//*************************************************************** -+//*********** Reciprocal/Sqrt ************************************ -+//*************************************************************** -+//****************** Reciprocal estimate ******************************* -+//the ARM NEON and x86 SIMD results may be slightly different -+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 -+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = _mm_rcp_ps(_pM128(a)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! 
No reciprocal for ints in IA32 available -+ uint32x2_t res; -+ float resf, r; -+ int i, q, s; -+ for (i =0; i<2; i++){ -+ if((a.m64_u32[i] & 0x80000000) == 0) { -+ res.m64_u32[i] = 0xffffffff; -+ }else{ -+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); -+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ -+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res.m64_u32[i] = r * (uint32_t)(1 << 31); -+ } -+ } -+ return res; -+} -+ -+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 -+#define vrecpeq_f32 _mm_rcp_ps -+ -+ -+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 uint32_t res[4]; -+ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000}; -+ float resf, r; -+ int i, q, s; -+ __m128i res128, mask, zero; -+ _mm_store_si128((__m128i*)atmp, a); -+ zero = _mm_setzero_si128(); -+ for (i =0; i<4; i++){ -+ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31)) -+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ -+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); -+ } -+ res128 = _mm_load_si128((__m128i*)res); -+ mask = _mm_and_si128(a, *(__m128i*)c80000000); -+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff -+ return _mm_or_si128(res128, mask); -+} -+ -+//**********Reciprocal square root estimate **************** -+//********************************************************** -+//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster -+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers -+////the ARM NEON and x86 SIMD results may be slightly different -+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 -+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = _mm_rsqrt_ps(_pM128(a)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double -+ uint32x2_t res; -+ __m128 tmp; -+ float r, resf, coeff; -+ int i,q0, q1, s;; -+ for (i =0; i<2; i++){ -+ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff -+ res.m64_u32[i] = 0xffffffff; -+ }else{ -+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); -+ coeff = (resf < 0.5)? 
512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ -+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ -+ r = ((float)q0 + 0.5) / coeff; -+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ -+ _mm_store_ss(&r, tmp); -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res.m64_u32[i] = r * (((uint32_t)1) << 31); -+ } -+ } -+ return res; -+} -+ -+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 -+#define vrsqrteq_f32 _mm_rsqrt_ps -+ -+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4]; -+ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)}; -+ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000}; -+ __m128 tmp; -+ __m128i res128, mask, zero; -+ float r, resf, coeff; -+ int i,q0, q1, s; -+ _mm_store_si128((__m128i*)atmp, a); -+ zero = _mm_setzero_si128(); -+ for (i =0; i<4; i++){ -+ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31))); -+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ -+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ -+ r = ((float)q0 + 0.5) / coeff; -+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ -+ _mm_store_ss(&r, tmp); -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); -+ } -+ res128 = _mm_load_si128((__m128i*)res); -+ mask = _mm_and_si128(a, *(__m128i*)c_c0000000); -+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff -+ return _mm_or_si128(res128, mask); -+} -+//************ Reciprocal estimate/step and 1/sqrt estimate/step *************************** -+//****************************************************************************************** -+//******VRECPS (Vector Reciprocal Step) *************************************************** -+//multiplies the elements of one vector by the corresponding elements of another vector, -+//subtracts each of the results from 2, and places the final results into the elements of the destination vector. -+ -+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 -+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = vrecpsq_f32(_pM128(a), _pM128(b)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 -+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0 -+{ -+ __m128 f2, mul; -+ f2 = _mm_set1_ps(2.); -+ mul = _mm_mul_ps(a,b); -+ return _mm_sub_ps(f2,mul); -+} -+ -+//*****************VRSQRTS (Vector Reciprocal Square Root Step) ***************************** -+//multiplies the elements of one vector by the corresponding elements of another vector, -+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector. 
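/*
 * [Editorial aside — not part of the NEON_2_SSE header or of this patch hunk.]
 * A minimal scalar sketch of how the VRECPS / VRSQRTS "step" operations described
 * in the comments above are typically used: the step result is multiplied back into
 * the current estimate to perform one Newton-Raphson refinement of 1/d and 1/sqrt(d).
 * The helper names below (refine_recip, refine_rsqrt) are illustrative only and do
 * not exist in the patched header; the formulas mirror the comments (2 - a*b and
 * (3 - a*b)/2 respectively).
 */
#include <stdio.h>

/* One reciprocal refinement: x' = x * (2 - d*x), i.e. x' = x * vrecps(d, x). */
static float refine_recip(float d, float x)  { return x * (2.0f - d * x); }

/* One 1/sqrt refinement: x' = x * (3 - d*x*x) / 2, i.e. x' = x * vrsqrts(d*x, x). */
static float refine_rsqrt(float d, float x)  { return x * (3.0f - d * x * x) * 0.5f; }

int main(void)
{
    float d = 5.0f;
    float r = 0.2f;   /* rough initial estimate of 1/5, e.g. from a vrecpe-style estimate  */
    float s = 0.45f;  /* rough initial estimate of 1/sqrt(5), e.g. from a vrsqrte estimate */
    for (int i = 0; i < 2; i++) {   /* two steps are usually enough for single precision */
        r = refine_recip(d, r);
        s = refine_rsqrt(d, s);
    }
    printf("1/5 ~ %.7f, 1/sqrt(5) ~ %.7f\n", r, s);
    return 0;
}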
-+ -+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 -+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2; -+ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2; -+ return res; -+} -+ -+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 -+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0 -+{ -+ __m128 f3, f05, mul; -+ f3 = _mm_set1_ps(3.); -+ f05 = _mm_set1_ps(0.5); -+ mul = _mm_mul_ps(a,b); -+ f3 = _mm_sub_ps(f3,mul); -+ return _mm_mul_ps (f3, f05); -+} -+//******************************************************************************************** -+//***************************** Shifts by signed variable *********************************** -+//******************************************************************************************** -+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) *********************** -+//******************************************************************************************** -+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution -+//helper macro. It matches ARM implementation for big shifts -+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \ -+ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \ -+ for (i = 0; i= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \ -+ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? 
a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \ -+ return res; -+ -+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(8, i, 8) -+} -+ -+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(16, i, 4) -+} -+ -+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(32, i, 2) -+} -+ -+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(64, i, 1) -+} -+ -+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(8, u, 8) -+} -+ -+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(16, u, 4) -+} -+ -+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(32, u, 2) -+} -+ -+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers -+{ -+ SERIAL_SHIFT_64(64, u, 1) -+} -+ -+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int8_t, int8_t, 16, 16) -+} -+ -+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int16_t, int16_t, 8, 8) -+} -+ -+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int32_t, int32_t, 4, 4) -+} -+ -+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int64_t, int64_t, 2, 2) -+} -+ -+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint8_t, int8_t, 16, 16) -+} -+ -+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint16_t, int16_t, 8, 8) -+} -+ -+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t 
b); // VSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint32_t, int32_t, 4, 4) -+} -+ -+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint64_t, int64_t, 2, 2) -+} -+ -+ -+//*********** Vector saturating shift left: (negative values shift right) ********************** -+//******************************************************************************************** -+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution -+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ -+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i> (-btmp[i]); \ -+ else{ \ -+ if (btmp[i]>lanesize_1) { \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ -+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ else res[i] = atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ -+ TYPE lanesize = (sizeof(TYPE) << 3); \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i> (-btmp[i]); \ -+ else{ \ -+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ -+ else{ \ -+ limit = (TYPE) 1 << (lanesize - btmp[i]); \ -+ res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \ -+ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \ -+ for (i = 0; i> (-(b.m64_i ## TYPE[i])); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize_1) { \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ -+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ -+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ -+ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \ -+ for (i = 0; i> (-(b.m64_i ## TYPE[i])); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ -+ else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ -+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? 
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \ -+ return res; -+ -+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8) -+} -+ -+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4) -+} -+ -+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2) -+} -+ -+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1) -+} -+ -+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8) -+} -+ -+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4) -+} -+ -+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2) -+} -+ -+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1) -+} -+ -+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) -+} -+ -+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) -+} -+ -+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) -+} -+ -+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) -+} -+ -+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) -+} -+ -+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8) -+} -+ -+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4) -+} -+ -+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2) -+} -+ -+ -+//******** Vector rounding shift left: (negative values shift right) ********** -+//**************************************************************************** -+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution -+//rounding makes sense for right shifts only. -+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i= 0) { \ -+ if(btmp[i] >= lanesize) res[i] = 0; \ -+ else res[i] = (atmp[i] << btmp[i]); \ -+ }else{ \ -+ res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \ -+ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \ -+ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \ -+ return _mm_load_si128((__m128i*)res); -+ -+ -+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \ -+ for (i = 0; i= 0) { \ -+ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \ -+ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \ -+ }else{ \ -+ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \ -+ (b.m64_i ## TYPE[i] == -lanesize) ? 
(a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \ -+ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \ -+ return res; -+ -+ -+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(8,i,8) -+} -+ -+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(16,i,4) -+} -+ -+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(32,i,2) -+} -+ -+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(64,i,1) -+} -+ -+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(8,u,8) -+} -+ -+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(16,u,4) -+} -+ -+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(32,u,2) -+} -+ -+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(64,u,1) -+} -+ -+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) -+} -+ -+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) -+} -+ -+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) -+} -+ -+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) -+} -+ -+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 
16, 16) -+} -+ -+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8) -+} -+ -+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4) -+} -+ -+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2) -+} -+ -+ -+//********** Vector saturating rounding shift left: (negative values shift right) **************** -+//************************************************************************************************* -+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution -+//Saturation happens for left shifts only while rounding makes sense for right shifts only. -+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ -+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ -+ else{ \ -+ if (btmp[i]>lanesize_1) { \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ -+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ else res[i] = atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ -+ int lanesize = (sizeof(TYPE) << 3); \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ -+ else{ \ -+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ -+ else{ \ -+ limit = (TYPE) 1 << (lanesize - btmp[i]); \ -+ res[i] = ( atmp[i] >= limit) ? 
res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \ -+ __m64_128 res; int ## TYPE ## _t limit; int i; \ -+ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \ -+ for (i = 0; i> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize_1) { \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ -+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ -+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \ -+ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ -+ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \ -+ for (i = 0; i> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ -+ else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ -+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8) -+} -+ -+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4) -+} -+ -+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2) -+} -+ -+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1) -+} -+ -+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8) -+} -+ -+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4) -+} -+ -+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ 
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2) -+} -+ -+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1) -+} -+ -+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) -+} -+ -+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) -+} -+ -+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) -+} -+ -+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) -+} -+ -+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) -+} -+ -+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) -+} -+ -+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) -+} -+ -+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) -+} -+ -+// ********************************************************************************* -+// ***************************** Shifts by a constant ***************************** -+// ********************************************************************************* -+//**************** Vector shift right by constant************************************* -+//************************************************************************************ -+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_srai_epi16 (r, b); //SSE2 -+ r = _mm_packs_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b) -+{ -+ int16x4_t 
res64; -+ return64(_mm_srai_epi16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(_mm_srai_epi32(_pM128i(a), b)); -+} -+ -+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no arithmetic shift for 64bit values, serial solution used -+ int64x1_t res; -+ if(b>=64) res.m64_i64[0] = 0; -+ else res.m64_i64[0] = (*(int64_t*)&a) >> b; -+ return res; -+} -+ -+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ uint8x8_t res64; -+ __m128i r; -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one -+ r = _mm_packus_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b) -+{ -+ uint16x4_t res64; -+ return64(_mm_srli_epi16(_pM128i(a), b)); -+} -+ -+ -+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ return64(_mm_srli_epi32(_pM128i(a), b)); -+} -+ -+ -+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b) -+{ -+ uint64x1_t res64; -+ return64(_mm_srli_epi64(_pM128i(a), b)); -+} -+ -+ -+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i zero, mask0, a_sign, r, a_sign_mask; -+ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; -+ zero = _mm_setzero_si128(); -+ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 -+ r = _mm_srai_epi16 (a, b); -+ a_sign_mask = _mm_and_si128 (mask0, a_sign); -+ r = _mm_andnot_si128 (mask0, r); -+ return _mm_or_si128 (r, a_sign_mask); -+} -+ -+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 -+#define vshrq_n_s16 _mm_srai_epi16 -+ -+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 -+#define vshrq_n_s32 _mm_srai_epi32 -+ -+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) -+{ -+ //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD -+ __m128i c1, signmask,a0, res64; -+ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; -+ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff -+ signmask = _mm_slli_epi64 (c1, (64 - b)); -+ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit -+ a0 = _MM_CMPEQ_EPI64 (a, a0); -+ signmask = _mm_and_si128(a0, 
signmask); -+ res64 = _mm_srli_epi64 (a, b); -+ return _mm_or_si128(res64, signmask); -+} -+ -+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 -+{ -+ //no 8 bit shift available, need the special trick -+ __m128i mask0, r; -+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; -+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ r = _mm_srli_epi16 ( a, b); -+ return _mm_and_si128 (r, mask0); -+} -+ -+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 -+#define vshrq_n_u16 _mm_srli_epi16 -+ -+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 -+#define vshrq_n_u32 _mm_srli_epi32 -+ -+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 -+#define vshrq_n_u64 _mm_srli_epi64 -+ -+//*************************** Vector shift left by constant ************************* -+//********************************************************************************* -+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_slli_epi16 (r, b); //SSE2 -+ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b) -+{ -+ int16x4_t res64; -+ return64(_mm_slli_epi16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ int32x2_t res64; -+ return64(_mm_slli_epi32(_pM128i(a), b)); -+} -+ -+ -+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b) -+{ -+ int64x1_t res64; -+ return64(_mm_slli_epi64(_pM128i(a), b)); -+} -+ -+ -+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b) -+{ -+ //no 8 bit shift available, go to 16 bit -+ uint8x8_t res64; -+ __m128i mask8; -+ __m128i r; -+ mask8 = _mm_set1_epi16(0xff); -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_slli_epi16 (r, b); //SSE2 -+ r = _mm_and_si128(r, mask8); //to avoid saturation -+ r = _mm_packus_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+#define vshl_n_u16 vshl_n_s16 -+ -+ -+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+#define vshl_n_u32 vshl_n_s32 -+ -+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+#define vshl_n_u64 vshl_n_s64 -+ -+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+#define vshlq_n_s8 vshlq_n_u8 -+ -+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); 
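
The shift-by-constant implementations above repeatedly note that IA-32 SIMD has no per-byte shift, so 8-bit lanes are shifted inside 16-bit lanes and the bits that leak across byte boundaries are masked off afterwards. A minimal standalone sketch of that widen-and-mask idea follows; it is not part of the patched header, assumes only SSE2 (compile with -msse2 on 32-bit x86), and uses hypothetical helper names with the shift count fixed at 3 for brevity:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>
#include <stdio.h>

/* Each byte shifted left by 3: do a 16-bit shift, then clear the low 3 bits of
   every byte, which is exactly where bits from the neighbouring byte leak in. */
static __m128i bytes_shl3(__m128i a)
{
    __m128i r = _mm_slli_epi16(a, 3);
    return _mm_and_si128(r, _mm_set1_epi8((char)0xF8));   /* 0xFF << 3 */
}

/* Each byte shifted right (logically) by 3: same idea, mask the high 3 bits. */
static __m128i bytes_shr3(__m128i a)
{
    __m128i r = _mm_srli_epi16(a, 3);
    return _mm_and_si128(r, _mm_set1_epi8(0x1F));         /* 0xFF >> 3 */
}

int main(void)
{
    uint8_t in[16], l[16], r[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)(i * 19 + 5);
    __m128i v = _mm_loadu_si128((const __m128i *)in);
    _mm_storeu_si128((__m128i *)l, bytes_shl3(v));
    _mm_storeu_si128((__m128i *)r, bytes_shr3(v));
    for (int i = 0; i < 16; i++)   /* l[i] == (uint8_t)(in[i] << 3), r[i] == in[i] >> 3 */
        printf("%3u  <<3 -> %3u   >>3 -> %3u\n", in[i], l[i], r[i]);
    return 0;
}

The vshrq_n_u8/vshlq_n_u8 code above does the same thing, except that it looks the per-byte mask up in a small table indexed by the shift count instead of recomputing it.
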
// VSHL.I16 q0,q0,#0 -+#define vshlq_n_s16 _mm_slli_epi16 -+ -+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+#define vshlq_n_s32 _mm_slli_epi32 -+ -+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+#define vshlq_n_s64 _mm_slli_epi64 -+ -+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) -+{ -+ //no 8 bit shift available, need the special trick -+ __m128i mask0, r; -+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; -+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ r = _mm_slli_epi16 ( a, b); -+ return _mm_and_si128 (r, mask0); -+} -+ -+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+#define vshlq_n_u16 vshlq_n_s16 -+ -+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+#define vshlq_n_u32 vshlq_n_s32 -+ -+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+#define vshlq_n_u64 vshlq_n_s64 -+ -+//************* Vector rounding shift right by constant ****************** -+//************************************************************************* -+//No corresponding x86 intrinsics exist, need to do some tricks -+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r, maskb; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 -+ r = _mm_srai_epi16 (r, b); -+ r = _mm_add_epi16 (r, maskb); //actual rounding -+ r = _mm_packs_epi16 (r,r); ////we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b) -+{ -+ int16x4_t res64; -+ return64(vrshrq_n_s16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(vrshrq_n_s32(_pM128i(a), b)); -+} -+ -+ -+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution is faster -+ int64x1_t res; -+ int64_t a_i64 = *( int64_t*)&a; -+ if(b==64) { -+ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63; -+ } else { -+ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1)); -+ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1)); -+ } -+ return res; -+} -+ -+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one -+ uint8x8_t res64; -+ __m128i r, maskb; -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit -+ maskb = 
_mm_srli_epi16 (maskb, 15); //1 or 0 -+ r = _mm_srli_epi16 (r, b); -+ r = _mm_add_epi16 (r, maskb); //actual rounding -+ r = _mm_packus_epi16 (r,r); ////we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b) -+{ -+ uint16x4_t res64; -+ return64(vrshrq_n_u16(_pM128i(a), b)); -+} -+ -+ -+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ return64(vrshrq_n_u32(_pM128i(a), b)); -+} -+ -+ -+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b) -+{ -+ uint64x1_t res64; -+ return64(vrshrq_n_u64(_pM128i(a), b)); -+} -+ -+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i r, mask1, maskb; -+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 -+ r = vshrq_n_s8 (a, b); -+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding -+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding -+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 -+ return _mm_add_epi8(r, maskb); //actual rounding -+} -+ -+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 -+ r = _mm_srai_epi16 (a, b); -+ return _mm_add_epi16 (r, maskb); //actual rounding -+} -+ -+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 -+ r = _mm_srai_epi32(a, b); -+ return _mm_add_epi32 (r, maskb); //actual rounding -+} -+ -+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) -+{ -+ //solution may be not optimal compared with a serial one -+ __m128i maskb; -+ int64x2_t r; -+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 -+ r = vshrq_n_s64(a, b); -+ return _mm_add_epi64 (r, maskb); //actual rounding -+} -+ -+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i r, mask1, maskb; -+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 -+ r = vshrq_n_u8 (a, b); -+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding -+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding -+ 
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 -+ return _mm_add_epi8(r, maskb); //actual rounding -+} -+ -+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 -+ r = _mm_srli_epi16 (a, b); -+ return _mm_add_epi16 (r, maskb); //actual rounding -+} -+ -+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 -+ r = _mm_srli_epi32(a, b); -+ return _mm_add_epi32 (r, maskb); //actual rounding -+} -+ -+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) -+{ -+ //solution may be not optimal compared with a serial one -+ __m128i maskb, r; -+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 -+ r = _mm_srli_epi64(a, b); -+ return _mm_add_epi64 (r, maskb); //actual rounding -+} -+ -+//************* Vector shift right by constant and accumulate ********* -+//********************************************************************* -+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8 -+{ -+ int8x8_t shift; -+ shift = vshr_n_s8(b, c); -+ return vadd_s8( a, shift); -+} -+ -+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16 -+{ -+ int16x4_t shift; -+ shift = vshr_n_s16( b, c); -+ return vadd_s16(a, shift); -+} -+ -+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ int32x2_t shift; -+ shift = vshr_n_s32(b, c); -+ return vadd_s32( a, shift); -+} -+ -+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 -+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) -+{ -+ //may be not optimal compared with a serial solution -+ int64x1_t shift; -+ shift = vshr_n_s64(b, c); -+ return vadd_s64( a, shift); -+} -+ -+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8 -+{ -+ uint8x8_t shift; -+ shift = vshr_n_u8(b, c); -+ return vadd_u8(a, shift); -+} -+ -+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16 -+{ -+ uint16x4_t shift; -+ shift = vshr_n_u16(b, c); -+ return vadd_u16(a,shift); -+} -+ -+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // 
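
The vrshr/vrshrq family above implements NEON's rounding right shift by adding back the last bit the shift discards: the operand is shifted left by (lane width - b) and then right by (lane width - 1) to isolate bit b-1, and that 0/1 value is added to the truncating shift result. A small self-contained sketch of the same trick for unsigned 16-bit lanes, assuming SSE2 and using an illustrative function name not taken from the header:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>
#include <stdio.h>

/* Rounding right shift of unsigned 16-bit lanes by n (1..15):
   result = (a >> n) + bit (n-1) of a, i.e. the truncating shift plus the
   last bit that was shifted out ("round half up"). */
static __m128i u16_rshr_round(__m128i a, int n)
{
    __m128i last = _mm_srli_epi16(_mm_slli_epi16(a, 16 - n), 15);  /* 0 or 1 per lane */
    return _mm_add_epi16(_mm_srli_epi16(a, n), last);
}

int main(void)
{
    uint16_t in[8] = { 6, 7, 8, 9, 100, 101, 0xFFFF, 0x8000 };
    uint16_t out[8];
    __m128i v = _mm_loadu_si128((const __m128i *)in);
    _mm_storeu_si128((__m128i *)out, u16_rshr_round(v, 2));
    for (int i = 0; i < 8; i++)
        printf("round(%u / 4) = %u\n", in[i], out[i]);   /* 6->2, 7->2, 8->2, 9->2, ... */
    return 0;
}

Signed lanes only differ in using the arithmetic shift (_mm_srai_epi16) for the value itself; the rounding bit is extracted the same way, as the vrshrq_n_s16 code above shows.
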
VSRA.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ uint32x2_t shift; -+ shift = vshr_n_u32(b, c); -+ return vadd_u32( a, shift); -+} -+ -+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64 -+{ -+ //may be not optimal compared with the serial execution -+ uint64x1_t shift; -+ shift = vshr_n_u64(b, c); -+ return vadd_u64(a, shift); -+} -+ -+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 -+{ -+ int8x16_t shift; -+ shift = vshrq_n_s8(b, c); -+ return vaddq_s8(a, shift); -+} -+ -+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 -+{ -+ int16x8_t shift; -+ shift = vshrq_n_s16(b, c); -+ return vaddq_s16(a, shift); -+} -+ -+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 -+{ -+ int32x4_t shift; -+ shift = vshrq_n_s32(b, c); -+ return vaddq_s32(a, shift); -+} -+ -+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 -+{ -+ int64x2_t shift; -+ shift = vshrq_n_s64(b, c); -+ return vaddq_s64( a, shift); -+} -+ -+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 -+{ -+ uint8x16_t shift; -+ shift = vshrq_n_u8(b, c); -+ return vaddq_u8(a, shift); -+} -+ -+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16 -+{ -+ uint16x8_t shift; -+ shift = vshrq_n_u16(b, c); -+ return vaddq_u16(a, shift); -+} -+ -+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 -+{ -+ uint32x4_t shift; -+ shift = vshrq_n_u32(b, c); -+ return vaddq_u32(a, shift); -+} -+ -+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 -+{ -+ uint64x2_t shift; -+ shift = vshrq_n_u64(b, c); -+ return vaddq_u64(a, shift); -+} -+ -+//************* Vector rounding shift right by constant and accumulate **************************** -+//************************************************************************************************ -+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8 -+{ -+ int8x8_t shift; -+ shift = 
vrshr_n_s8(b, c); -+ return vadd_s8( a, shift); -+} -+ -+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16 -+{ -+ int16x4_t shift; -+ shift = vrshr_n_s16( b, c); -+ return vadd_s16(a, shift); -+} -+ -+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ int32x2_t shift; -+ shift = vrshr_n_s32(b, c); -+ return vadd_s32( a, shift); -+} -+ -+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution -+{ -+ int64x1_t shift; -+ shift = vrshr_n_s64(b, c); -+ return vadd_s64( a, shift); -+} -+ -+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8 -+{ -+ uint8x8_t shift; -+ shift = vrshr_n_u8(b, c); -+ return vadd_u8(a, shift); -+} -+ -+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16 -+{ -+ uint16x4_t shift; -+ shift = vrshr_n_u16(b, c); -+ return vadd_u16(a,shift); -+} -+ -+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ uint32x2_t shift; -+ shift = vrshr_n_u32(b, c); -+ return vadd_u32( a, shift); -+} -+ -+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution -+{ -+ //may be not optimal compared with the serial execution -+ uint64x1_t shift; -+ shift = vrshr_n_u64(b, c); -+ return vadd_u64( a, shift); -+} -+ -+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 -+{ -+ int8x16_t shift; -+ shift = vrshrq_n_s8(b, c); -+ return vaddq_s8(a, shift); -+} -+ -+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 -+{ -+ int16x8_t shift; -+ shift = vrshrq_n_s16(b, c); -+ return vaddq_s16(a, shift); -+} -+ -+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 -+{ -+ int32x4_t shift; -+ shift = vrshrq_n_s32(b, c); -+ return vaddq_s32(a, shift); -+} -+ -+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 -+_NEON2SSE_INLINE 
int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) -+{ -+ int64x2_t shift; -+ shift = vrshrq_n_s64(b, c); -+ return vaddq_s64(a, shift); -+} -+ -+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 -+{ -+ uint8x16_t shift; -+ shift = vrshrq_n_u8(b, c); -+ return vaddq_u8(a, shift); -+} -+ -+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 -+{ -+ uint16x8_t shift; -+ shift = vrshrq_n_u16(b, c); -+ return vaddq_u16(a, shift); -+} -+ -+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 -+{ -+ uint32x4_t shift; -+ shift = vrshrq_n_u32(b, c); -+ return vaddq_u32(a, shift); -+} -+ -+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) -+{ -+ uint64x2_t shift; -+ shift = vrshrq_n_u64(b, c); -+ return vaddq_u64(a, shift); -+} -+ -+//**********************Vector saturating shift left by constant ***************************** -+//******************************************************************************************** -+//we don't check const ranges assuming they are met -+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) -+ int8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); -+ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only -+ return64(r128); -+} -+ -+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0 -+{ -+ // go to 32 bit to get the auto saturation (in packs function) -+ int16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only -+ return64(r128); -+} -+ -+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ //serial execution may be faster -+ int32x2_t res64; -+ return64(vqshlq_n_s32 (_pM128i(a), b)); -+} -+ -+ -+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ int64x1_t res; -+ int64_t bmask; -+ int64_t a_i64 = *( int64_t*)&a; -+ bmask = ( int64_t)1 << (63 - b); //positive -+ if (a_i64 >= bmask) { -+ res.m64_i64[0] = ~(_SIGNBIT64); -+ } else { -+ res.m64_i64[0] = (a_i64 <= -bmask) ? 
_SIGNBIT64 : a_i64 << b; -+ } -+ return res; -+} -+ -+ -+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit -+ uint8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); //shift_res -+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only -+ return64(r128); -+} -+ -+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0 -+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0 -+{ -+ // go to 32 bit to get the auto saturation (in packus function) -+ uint16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16 -+ return64(r128); -+} -+ -+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 -+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b) -+{ -+ uint32x2_t res64; -+ return64(vqshlq_n_u32(_pM128i(a), b)); -+} -+ -+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ uint64x1_t res; -+ uint64_t bmask; -+ uint64_t a_i64 = *(uint64_t*)&a; -+ bmask = ( uint64_t)1 << (64 - b); -+ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a -+ return res; -+} -+ -+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 -+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 -+{ -+ // go to 16 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 -+} -+ -+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 -+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 -+{ -+ // manual saturation solution looks LESS optimal than 32 bits conversion one -+ // go to 32 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI16_EPI32 (a128); -+ r128_2 = _mm_slli_epi32 (a128, b); -+ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 -+} -+ -+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 -+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 -+{ -+ // no 64 bit saturation option available, special tricks necessary -+ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; -+ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff -+ maskA = _mm_srli_epi32(c1, b + 
1); //mask for positive numbers (32-b+1) zeros and b-1 ones -+ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise -+ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not -+ shift_res = _mm_slli_epi32 (a, b); -+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); -+ //result with positive numbers saturated -+ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); -+ //treat negative numbers -+ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros -+ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise -+ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not -+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); -+ return _mm_or_si128 (c7ffffff_mask, shift_res_mask); -+} -+ -+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; -+ int64_t bmask; -+ int i; -+ bmask = ( int64_t)1 << (63 - b); //positive -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ if (atmp[i] >= bmask) { -+ res[i] = ~(_SIGNBIT64); -+ } else { -+ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b; -+ } -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 -+{ -+ // go to 16 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPU8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 -+} -+ -+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 -+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 -+{ -+ // manual saturation solution looks more optimal than 32 bits conversion one -+ __m128i cb, c8000, a_signed, saturation_mask, shift_res; -+ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); -+ c8000 = _mm_set1_epi16 (0x8000); -+//no unsigned shorts comparison in SSE, only signed available, so need the trick -+ a_signed = _mm_sub_epi16(a, c8000); //go to signed -+ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); -+ shift_res = _mm_slli_epi16 (a, b); -+ return _mm_or_si128 (shift_res, saturation_mask); -+} -+ -+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 -+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 -+{ -+ // manual saturation solution, no 64 bit saturation option, the serial version may be faster -+ __m128i cb, c80000000, a_signed, saturation_mask, shift_res; -+ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); -+ c80000000 = _mm_set1_epi32 (0x80000000); -+//no unsigned ints comparison in SSE, only signed available, so need the trick -+ a_signed = _mm_sub_epi32(a, c80000000); //go to signed -+ saturation_mask = _mm_cmpgt_epi32 
(a_signed, cb); -+ shift_res = _mm_slli_epi32 (a, b); -+ return _mm_or_si128 (shift_res, saturation_mask); -+} -+ -+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; -+ uint64_t bmask; -+ int i; -+ bmask = ( uint64_t)1 << (64 - b); -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//**************Vector signed->unsigned saturating shift left by constant ************* -+//************************************************************************************* -+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) -+ uint8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); -+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only -+ return64(r128); -+} -+ -+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 -+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0 -+{ -+ uint16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only -+ return64(r128); -+} -+ -+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ int32x2_t res64; -+ return64( vqshluq_n_s32(_pM128i(a), b)); -+} -+ -+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster -+{ -+ uint64x1_t res; -+ uint64_t limit; -+ if (a.m64_i64[0]<=0) { -+ res.m64_u64[0] = 0; -+ } else { -+ limit = (uint64_t) 1 << (64 - b); -+ res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? 
res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b; -+ } -+ return res; -+} -+ -+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 -+{ -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 -+} -+ -+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 -+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 -+{ -+ // manual saturation solution looks LESS optimal than 32 bits conversion one -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI16_EPI32 (a128); -+ r128_2 = _mm_slli_epi32 (a128, b); -+ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 -+} -+ -+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 -+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; -+ zero = _mm_setzero_si128(); -+ maskA = _mm_cmpeq_epi32(a, a); -+ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros -+ //saturate negative numbers to zero -+ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) -+ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now -+ //saturate positive to 0xffffffff -+ a_masked = _mm_and_si128 (a0, maskA); -+ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise -+ a_shift = _mm_slli_epi32 (a0, b); -+ return _mm_or_si128 (a_shift, a_masked); //actual saturation -+} -+ -+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here, serial execution looks faster -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ uint64_t limit; -+ int i; -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ if (atmp[i]<=0) { -+ res[i] = 0; -+ } else { -+ limit = (uint64_t) 1 << (64 - b); -+ res[i] = ( ((uint64_t)atmp[i]) >= limit) ? 
res[i] = ~((uint64_t)0) : atmp[i] << b; -+ } -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//************** Vector narrowing shift right by constant ************** -+//********************************************************************** -+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r16); -+} -+ -+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r32); -+} -+ -+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ __m128i r64; -+ r64 = vshrq_n_s64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i mask, r16; -+ mask = _mm_set1_epi16(0xff); -+ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_and_si128(r16, mask); //to avoid saturation -+ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i mask, r32; -+ mask = _mm_set1_epi32(0xffff); -+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16) -+ r32 = _mm_and_si128(r32, mask); //to avoid saturation -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ __m128i r64; -+ r64 = vshrq_n_u64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************** Vector signed->unsigned narrowing saturating shift right by constant ******** -+//********************************************************************************************* -+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t 
vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i r16; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ uint32x2_t res; -+ int64_t res64; -+ _mm_store_si128((__m128i*)atmp, a); -+ if (atmp[0] < 0) { -+ res.m64_u32[0] = 0; -+ } else { -+ res64 = (atmp[0] >> b); -+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64; -+ } -+ if (atmp[1] < 0) { -+ res.m64_u32[1] = 0; -+ } else { -+ res64 = (atmp[1] >> b); -+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64; -+ } -+ return res; -+} -+ -+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant ***** -+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i r16; -+ uint8x8_t res64; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i r32; -+ uint16x4_t res64; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ uint32x2_t res; -+ int64_t res64; -+ _mm_store_si128((__m128i*)atmp, a); -+ if (atmp[0] < 0) { -+ res.m64_u32[0] = 0; -+ } else { -+ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); -+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64; -+ } -+ if (atmp[1] < 0) { -+ res.m64_u32[1] = 0; -+ } else { -+ res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); -+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 
0xffffffff : res64; -+ } -+ return res; -+} -+ -+//***** Vector narrowing saturating shift right by constant ****** -+//***************************************************************** -+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //no optimal SIMD solution found -+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2]; -+ int32x2_t res; -+ _mm_store_si128((__m128i*)atmp, a); -+ res64[0] = (atmp[0] >> b); -+ res64[1] = (atmp[1] >> b); -+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; -+ if(res64[0]SINT_MAX) res64[1] = SINT_MAX; -+ if(res64[1]=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ //serial solution may be faster -+ uint32x2_t res64; -+ __m128i r64, res_hi, zero; -+ zero = _mm_setzero_si128(); -+ r64 = vshrq_n_u64(a,b); -+ res_hi = _mm_srli_epi64(r64, 32); -+ res_hi = _mm_cmpgt_epi32(res_hi, zero); -+ r64 = _mm_or_si128(r64, res_hi); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+ -+//********* Vector rounding narrowing shift right by constant ************************* -+//**************************************************************************************** -+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. 
Impossible to use _mm_packs because of negative saturation problems -+ return64(r16); -+} -+ -+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r32); -+} -+ -+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ __m128i r64; -+ r64 = vrshrq_n_s64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i mask, r16; -+ mask = _mm_set1_epi16(0xff); -+ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_and_si128(r16, mask); //to avoid saturation -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i mask, r32; -+ mask = _mm_set1_epi32(0xffff); -+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _mm_and_si128(r32, mask); //to avoid saturation -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster -+{ -+ uint32x2_t res64; -+ __m128i r64; -+ r64 = vrshrq_n_u64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************* Vector rounding narrowing saturating shift right by constant ************ -+//**************************************************************************************** -+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+int32x2_t vqrshrn_n_s64(int64x2_t a, 
__constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //no optimal SIMD solution found -+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2]; -+ int32x2_t res; -+ _mm_store_si128((__m128i*)atmp, a); -+ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1)); -+ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result -+ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1)); -+ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result -+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; -+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; -+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; -+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; -+ res.m64_i32[0] = (int32_t)res64[0]; -+ res.m64_i32[1] = (int32_t)res64[1]; -+ return res; -+} -+ -+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.U16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i r16; -+ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ //serial solution may be faster -+ uint32x2_t res64; -+ __m128i r64, res_hi, zero; -+ zero = _mm_setzero_si128(); -+ r64 = vrshrq_n_u64(a,b); -+ res_hi = _mm_srli_epi64(r64, 32); -+ res_hi = _mm_cmpgt_epi32(res_hi, zero); -+ r64 = _mm_or_si128(r64, res_hi); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************** Vector widening shift left by constant **************** -+//************************************************************************ -+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 -+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ return _mm_slli_epi16 (r, b); -+} -+ -+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 -+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1, -+ return _mm_slli_epi32 (r, b); -+} -+ -+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 -+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1, -+ return _mm_slli_epi64 (r, b); -+} -+ -+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 -+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0 -+{ -+ //no uint8 to uint16 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi8(_pM128i(a), zero); -+ return _mm_slli_epi16 (r, b); -+} -+ -+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0 -+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a,
__constrange(0,16) int b) // VSHLL.s16 q0,d0,#0 -+{ -+ //no uint16 to uint32 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi16(_pM128i(a), zero); -+ return _mm_slli_epi32 (r, b); -+} -+ -+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 -+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0 -+{ -+ //no uint32 to uint64 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi32(_pM128i(a), zero); -+ return _mm_slli_epi64 (r, b); -+} -+ -+//************************************************************************************ -+//**************************** Shifts with insert ************************************ -+//************************************************************************************ -+//takes each element in a vector, shifts them by an immediate value, -+//and inserts the results in the destination vector. Bits shifted out of the each element are lost. -+ -+//**************** Vector shift right and insert ************************************ -+//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. -+//All other bits are taken from b shifted. -+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) -+{ -+ int8x8_t res64; -+ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) -+{ -+ int16x4_t res64; -+ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) -+{ -+ int32x2_t res64; -+ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) -+{ -+ int64x1_t res; -+ if (c ==64) -+ res = a; -+ else{ -+ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros -+ } -+ return res; -+} -+ -+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+#define vsri_n_u8 vsri_n_s8 -+ -+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+#define vsri_n_u16 vsri_n_s16 -+ -+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+#define vsri_n_u32 vsri_n_s32 -+ -+ -+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+#define vsri_n_u64 vsri_n_s64 -+ -+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+#define vsri_n_p8 vsri_n_u8 -+ -+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+#define vsri_n_p16 vsri_n_u16 -+ -+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 
-+{ -+ __m128i maskA, a_masked; -+ uint8x16_t b_shift; -+ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used -+ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros -+ a_masked = _mm_and_si128 (a, maskA); -+ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift -+ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) -+} -+ -+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 -+{ -+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a -+ uint16x8_t b_shift; -+ uint16x8_t a_c; -+ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift -+ a_c = vshrq_n_u16( a, (16 - c)); -+ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 -+{ -+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a -+ uint32x4_t b_shift; -+ uint32x4_t a_c; -+ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift -+ a_c = vshrq_n_u32( a, (32 - c)); -+ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) -+{ -+ //serial solution may be faster -+ uint64x2_t b_shift; -+ uint64x2_t a_c; -+ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift -+ a_c = _mm_srli_epi64(a, (64 - c)); -+ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+#define vsriq_n_u8 vsriq_n_s8 -+ -+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+#define vsriq_n_u16 vsriq_n_s16 -+ -+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+#define vsriq_n_u32 vsriq_n_s32 -+ -+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+#define vsriq_n_u64 vsriq_n_s64 -+ -+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+#define vsriq_n_p8 vsriq_n_u8 -+ -+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+#define vsriq_n_p16 vsriq_n_u16 -+ -+//***** Vector shift left and insert ********************************************* -+//********************************************************************************* -+//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. -+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 
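To make the comment above concrete, here is a minimal standalone C sketch of the per-lane arithmetic that the vsli_n_* emulations below implement (the helper name vsli_lane_u8_ref is hypothetical and is not part of arm_neon.h or of this patch):

#include <stdint.h>
#include <stdio.h>

/* Reference model of one 8-bit VSLI lane: the result is b shifted left by c, */
/* with only the low c bits of the destination operand a preserved.           */
static uint8_t vsli_lane_u8_ref(uint8_t a, uint8_t b, int c)
{
    uint8_t keep_a = (uint8_t)(a & ((1u << c) - 1u)); /* low c bits survive from a      */
    uint8_t from_b = (uint8_t)(b << c);               /* all other bits come from b << c */
    return (uint8_t)(from_b | keep_a);
}

int main(void)
{
    /* a = 0xF5, b = 0x0F, c = 3: (0x0F << 3) | (0xF5 & 0x07) = 0x78 | 0x05 = 0x7D */
    printf("0x%02x\n", vsli_lane_u8_ref(0xF5, 0x0F, 3));
    return 0;
}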
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c) -+{ -+ int8x8_t res64; -+ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c) -+{ -+ int16x4_t res64; -+ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c) -+{ -+ int32x2_t res64; -+ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c)); -+} -+ -+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros -+ return res; -+} -+ -+ -+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+#define vsli_n_u8 vsli_n_s8 -+ -+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+#define vsli_n_u16 vsli_n_s16 -+ -+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+#define vsli_n_u32 vsli_n_s32 -+ -+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+#define vsli_n_u64 vsli_n_s64 -+ -+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+#define vsli_n_p8 vsli_n_u8 -+ -+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+#define vsli_n_p16 vsli_n_u16 -+ -+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 -+{ -+ __m128i maskA, a_masked; -+ int8x16_t b_shift; -+ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask -+ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones -+ b_shift = vshlq_n_s8( b, c); -+ a_masked = _mm_and_si128 (a, maskA); -+ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) -+} -+ -+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 -+{ -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a -+ int16x8_t b_shift; -+ int16x8_t a_c; -+ b_shift = vshlq_n_s16( b, c); -+ a_c = vshlq_n_s16( a, (16 - c)); -+ a_c = _mm_srli_epi16(a_c, (16 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a -+ int32x4_t b_shift; -+ int32x4_t a_c; -+ b_shift = vshlq_n_s32( b, c); 
-+ a_c = vshlq_n_s32( a, (32 - c)); -+ a_c = _mm_srli_epi32(a_c, (32 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a -+ int64x2_t b_shift; -+ int64x2_t a_c; -+ b_shift = vshlq_n_s64( b, c); -+ a_c = vshlq_n_s64( a, (64 - c)); -+ a_c = _mm_srli_epi64(a_c, (64 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+#define vsliq_n_u8 vsliq_n_s8 -+ -+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+#define vsliq_n_u16 vsliq_n_s16 -+ -+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+#define vsliq_n_u32 vsliq_n_s32 -+ -+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+#define vsliq_n_u64 vsliq_n_s64 -+ -+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+#define vsliq_n_p8 vsliq_n_u8 -+ -+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+#define vsliq_n_p16 vsliq_n_u16 -+ -+// *********************************************************************************************** -+// ****************** Loads and stores of a single vector *************************************** -+// *********************************************************************************************** -+//Performs loads and stores of a single vector of some type. -+//******************************* Loads ******************************************************** -+// *********************************************************************************************** -+//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. -+//also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. -+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access -+//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; -+#define LOAD_SI128(ptr) \ -+ ( ((unsigned long)(ptr) & 15) == 0 ) ? 
_mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)); -+ -+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_u8 LOAD_SI128 -+ -+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_u16 LOAD_SI128 -+ -+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+#define vld1q_u32 LOAD_SI128 -+ -+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld1q_u64 LOAD_SI128 -+ -+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_s8 LOAD_SI128 -+ -+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_s16 LOAD_SI128 -+ -+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+#define vld1q_s32 LOAD_SI128 -+ -+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld1q_s64 LOAD_SI128 -+ -+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers -+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] -+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); -+__m128 f2; -+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); -+}*/ -+ -+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) -+{ -+ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned -+ return _mm_load_ps(ptr); -+ else -+ return _mm_loadu_ps(ptr); -+} -+ -+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_p8 LOAD_SI128 -+ -+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_p16 LOAD_SI128 -+ -+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr)) -+ -+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_u16 vld1_u8 -+ -+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] -+#define vld1_u32 vld1_u8 -+ -+ -+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_u64 vld1_u8 -+ -+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_s8 vld1_u8 -+ -+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_s16 vld1_u16 -+ -+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] -+#define vld1_s32 vld1_u32 -+ -+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_s64 vld1_u64 -+ -+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); -+ -+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = *(ptr); -+ res.m64_f32[1] = *(ptr + 1); -+ return res; -+} -+ -+poly8x8_t vld1_p8(__transfersize(8) 
poly8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_p8 vld1_u8 -+ -+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_p16 vld1_u16 -+ -+//*********************************************************************************************************** -+//******* Lane load functions - insert the data at vector's given position (lane) ************************* -+//*********************************************************************************************************** -+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) -+ -+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; -+ -+ -+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) -+ -+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) -+{ -+ //we need to deal with ptr 16bit NOT aligned case -+ __m128 p; -+ p = _mm_set1_ps(*(ptr)); -+ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); -+} -+ -+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) -+ -+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int 
lane) -+{ -+ uint8x8_t res; -+ res = vec; -+ res.m64_u8[lane] = *(ptr); -+ return res; -+} -+ -+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16x4_t res; -+ res = vec; -+ res.m64_u16[lane] = *(ptr); -+ return res; -+} -+ -+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32x2_t res; -+ res = vec; -+ res.m64_u32[lane] = *(ptr); -+ return res; -+} -+ -+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = *(ptr); -+ return res; -+} -+ -+ -+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane) -+ -+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane) -+ -+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane) -+ -+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32x2_t res; -+ res = vec; -+ res.m64_f32[lane] = *(ptr); -+ return res; -+} -+ -+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] -+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane) -+ -+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1_lane_p8 vld1_lane_u8 -+ -+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1_lane_p16 vld1_lane_s16 -+ -+// ****************** Load single value ( set all lanes of vector with same value from memory)********************** -+// ****************************************************************************************************************** -+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr)) -+ -+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr)) -+ -+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_u32(ptr) 
_mm_set1_epi32(*(ptr)) -+ -+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)}; -+ return LOAD_SI128(val); -+} -+ -+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr)) -+ -+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr)) -+ -+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr)) -+ -+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr) -+ -+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+//current IA SIMD doesn't support float16, need to go to 32 bits -+ -+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr)) -+ -+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr)) -+ -+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr)) -+ -+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint8x8_t res; -+ int i; -+ for(i = 0; i<8; i++) { -+ res.m64_u8[i] = *(ptr); -+ } -+ return res; -+} -+ -+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint16x4_t res; -+ int i; -+ for(i = 0; i<4; i++) { -+ res.m64_u16[i] = *(ptr); -+ } -+ return res; -+} -+ -+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = *(ptr); -+ res.m64_u32[1] = *(ptr); -+ return res; -+} -+ -+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = *(ptr); -+ return res; -+} -+ -+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr) -+ -+ -+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr) -+ -+ -+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr) -+ -+ -+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr) -+ -+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // 
VLD1.32 {d0[]}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = *(ptr); -+ res.m64_f32[1] = res.m64_f32[0]; -+ return res; // use last 64bits only -+} -+ -+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1_dup_p8 vld1_dup_u8 -+ -+ -+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1_dup_p16 vld1_dup_u16 -+ -+ -+//************************************************************************************* -+//********************************* Store ********************************************** -+//************************************************************************************* -+// If ptr is 16bit aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val); -+//here we assume the case of NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro -+#define STORE_SI128(ptr, val) \ -+ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); -+ -+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_u8 STORE_SI128 -+ -+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_u16 STORE_SI128 -+ -+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] -+#define vst1q_u32 STORE_SI128 -+ -+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] -+#define vst1q_u64 STORE_SI128 -+ -+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_s8 STORE_SI128 -+ -+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_s16 STORE_SI128 -+ -+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] -+#define vst1q_s32 STORE_SI128 -+ -+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] -+#define vst1q_s64 STORE_SI128 -+ -+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) -+{ -+ if( ((unsigned long)(ptr) & 15) == 0 ) //16 bits aligned -+ _mm_store_ps (ptr, val); -+ else -+ _mm_storeu_ps (ptr, val); -+} -+ -+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_p8 vst1q_u8 -+ -+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_p16 vst1q_u16 -+ -+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val) -+{ -+ int i; -+ for (i = 0; i<8; i++) { -+ *(ptr + i) = ((uint8_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val) -+{ -+ int i; -+ for (i = 0; i<4; i++) { -+ *(ptr + i) = ((uint16_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, 
val); -+ return; -+} -+ -+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val) -+{ -+ int i; -+ for (i = 0; i<2; i++) { -+ *(ptr + i) = ((uint32_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val) -+{ -+ *(ptr) = *((uint64_t*)&val); -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] -+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val) -+ -+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] -+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val) -+ -+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] -+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val) -+ -+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] -+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val) -+ -+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val) -+{ -+ *(ptr) = val.m64_f32[0]; -+ *(ptr + 1) = val.m64_f32[1]; -+ return; -+} -+ -+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] -+#define vst1_p8 vst1_u8 -+ -+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] -+#define vst1_p16 vst1_u16 -+ -+//***********Store a lane of a vector into memory (extract given lane) ********************* -+//****************************************************************************************** -+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) -+ -+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) -+ -+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) -+ -+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] -+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) -+ -+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) -+ -+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) -+ -+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) -+ -+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); 
// VST1.64 {d0}, [r0] -+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) -+ -+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) -+{ -+ int32_t ilane; -+ ilane = _MM_EXTRACT_PS(val,lane); -+ *(ptr) = *((float*)&ilane); -+} -+ -+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_p8 vst1q_lane_u8 -+ -+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_p16 vst1q_lane_s16 -+ -+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val.m64_u8[lane]; -+} -+ -+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val.m64_u16[lane]; -+} -+ -+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val.m64_u32[lane]; -+} -+ -+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane) -+{ -+ *(ptr) = val.m64_u64[0]; -+} -+ -+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane) -+ -+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane) -+ -+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane) -+ -+ -+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] -+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane) -+ -+ -+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val.m64_f32[lane]; -+} -+ -+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1_lane_p8 vst1_lane_u8 -+ -+void vst1_lane_p16(__transfersize(1) 
poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1_lane_p16 vst1_lane_s16 -+ -+//*********************************************************************************************** -+//**************** Loads and stores of an N-element structure ********************************** -+//*********************************************************************************************** -+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning -+//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" -+//****************** 2 elements load ********************************************* -+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] -+{ -+ uint8x16x2_t v; -+ v.val[0] = vld1q_u8(ptr); -+ v.val[1] = vld1q_u8((ptr + 16)); -+ v = vuzpq_s8(v.val[0], v.val[1]); -+ return v; -+} -+ -+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] -+{ -+ uint16x8x2_t v; -+ v.val[0] = vld1q_u16( ptr); -+ v.val[1] = vld1q_u16( (ptr + 8)); -+ v = vuzpq_s16(v.val[0], v.val[1]); -+ return v; -+} -+ -+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] -+{ -+ uint32x4x2_t v; -+ v.val[0] = vld1q_u32 ( ptr); -+ v.val[1] = vld1q_u32 ( (ptr + 4)); -+ v = vuzpq_s32(v.val[0], v.val[1]); -+ return v; -+} -+ -+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); -+#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) -+ -+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) -+ -+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) -+ -+ -+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example -+ -+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] -+{ -+ float32x4x2_t v; -+ v.val[0] = vld1q_f32 (ptr); -+ v.val[1] = vld1q_f32 ((ptr + 4)); -+ v = vuzpq_f32(v.val[0], v.val[1]); -+ return v; -+} -+ -+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+#define vld2q_p8 vld2q_u8 -+ -+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+#define vld2q_p16 vld2q_u16 -+ -+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) -+{ -+ uint8x8x2_t v; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; -+ __m128i ld128; -+ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd); -+ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); -+ return v; -+} -+ -+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint16x4x2_t v; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ __m128i ld128; -+ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd); -+ vst1q_u16((v.val), ld128); -+ return v; -+} -+ -+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint32x2x2_t v; -+ __m128i ld128; -+ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ vst1q_u32((v.val), ld128); -+ return v; -+} -+ -+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) -+{ -+ uint64x1x2_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ return v; -+} -+ -+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) -+ -+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) -+ -+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) -+ -+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) -+ -+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1_f16 for example -+ -+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) -+{ -+ float32x2x2_t v; -+ v.val[0].m64_f32[0] = *(ptr); -+ v.val[0].m64_f32[1] = *(ptr + 2); -+ v.val[1].m64_f32[0] = *(ptr + 1); -+ v.val[1].m64_f32[1] = *(ptr + 3); -+ return v; -+} -+ -+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+#define vld2_p8 vld2_u8 -+ -+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+#define vld2_p16 vld2_u16 -+ -+//******************** Triplets *************************************** -+//********************************************************************* -+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> -+ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 -+ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, -+ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 -+ uint8x16x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; -+ -+ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 -+ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 -+ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 -+ -+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 -+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 -+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 -+ -+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 -+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x -+ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, -+ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 -+ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, -+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, -+ -+ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, -+ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 -+ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 -+ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, -+ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, -+ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 -+ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 -+ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, -+ -+ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, -+ v.val[2] = _mm_srli_si128(tmp1,11); 
//b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 -+ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, -+ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, -+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, -+ return v; -+} -+ -+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] -+{ -+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 -+ uint16x8x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; -+ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; -+ -+ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, -+ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 -+ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 -+ -+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, -+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 -+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 -+ -+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, -+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x -+ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 -+ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 -+ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, -+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5 -+ -+ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7 -+ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 -+ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 -+ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6, -+ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5, -+ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0, -+ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, -+ -+ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, -+ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 -+ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7, -+ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7, -+ return v; -+} -+ -+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ uint32x4x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, -+ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 -+ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, -+ -+ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); 
//a0,a3,a1,a2 -+ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 -+ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 -+ -+ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 -+ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 -+ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 -+ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, -+ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 -+ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 -+ return v; -+} -+ -+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) -+ -+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) -+ -+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) -+ -+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ float32x4x3_t v; -+ __m128 tmp0, tmp1,tmp2, tmp3; -+ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, -+ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 -+ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, -+ -+ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 -+ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 -+ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 -+ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 -+ -+ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 -+ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 -+ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, -+ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 -+ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 -+ return v; -+} -+ -+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+#define vld3q_p8 vld3q_u8 -+ -+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+#define vld3q_p16 vld3q_u16 -+ -+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0] -+{ -+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 -+ uint8x8x3_t v; -+ __m128i val0, val1, val2, tmp0, tmp1; -+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; -+ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 -+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7 -+ -+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); 
//a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, -+ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x -+ val0 = _mm_slli_si128(tmp0,10); -+ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 -+ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x -+ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x -+ _M64(v.val[0], val0); -+ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, -+ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, -+ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 -+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 -+ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x -+ _M64(v.val[1], val1); -+ -+ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, -+ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 -+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7, -+ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ uint16x4x3_t v; -+ __m128i val0, val1, val2, tmp0, tmp1; -+ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; -+ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 -+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x -+ -+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 -+ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3, -+ val0 = _mm_slli_si128(tmp0,10); -+ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0, -+ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1 -+ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0 -+ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x -+ _M64(v.val[0], val0); -+ -+ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 -+ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0, -+ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0, -+ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0 -+ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x -+ _M64(v.val[1], val1); -+ -+ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 -+ tmp1 = _mm_srli_si128(tmp1,4); -+ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, -+ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] -+{ -+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 -+ uint32x2x3_t v; -+ __m128i val0, val1, val2; -+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, -+ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x -+ -+ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 -+ _M64(v.val[0], val0); -+ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1, -+ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1 -+ _M64(v.val[1], val1); -+ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x, -+ _M64(v.val[2], val2); -+ return v; -+} -+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // 
VLD1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] -+{ -+ uint64x1x3_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ return v; -+} -+ -+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) -+ -+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) -+ -+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) -+ -+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) -+ -+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) -+{ -+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 -+ float32x2x3_t v; -+ v.val[0].m64_f32[0] = *(ptr); -+ v.val[0].m64_f32[1] = *(ptr + 3); -+ -+ v.val[1].m64_f32[0] = *(ptr + 1); -+ v.val[1].m64_f32[1] = *(ptr + 4); -+ -+ v.val[2].m64_f32[0] = *(ptr + 2); -+ v.val[2].m64_f32[1] = *(ptr + 5); -+ return v; -+} -+ -+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+#define vld3_p8 vld3_u8 -+ -+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+#define vld3_p16 vld3_u16 -+ -+//*************** Quadruples load ******************************** -+//***************************************************************** -+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] -+{ -+ uint8x16x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ -+ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 -+ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15 -+ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15 -+ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 -+ -+ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 -+ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 -+ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 -+ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 -+ -+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 -+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 -+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 -+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 -+ -+ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 -+ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 -+ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, -+ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 -+ -+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 -+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 -+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 -+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 -+ return v; -+} -+ -+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] -+{ -+ uint16x8x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 -+ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 -+ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 -+ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 -+ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, -+ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, -+ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7 -+ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7 -+ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5 -+ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 -+ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 -+ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7 -+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, -+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 -+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, -+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 -+ return v; -+} -+ -+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] -+{ -+ uint32x4x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ v.val[0] = vld1q_u32 (ptr); -+ v.val[1] = vld1q_u32 ((ptr + 4)); -+ v.val[2] = vld1q_u32 ((ptr + 8)); -+ v.val[3] = vld1q_u32 ((ptr + 12)); -+ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); -+ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]); -+ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); -+ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); -+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); -+ v.val[1] = 
_mm_unpackhi_epi64(tmp0, tmp1); -+ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); -+ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); -+ return v; -+} -+ -+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) -+ -+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) -+ -+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) -+ -+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] -+{ -+ float32x4x4_t v; -+ __m128 tmp3, tmp2, tmp1, tmp0; -+ -+ v.val[0] = vld1q_f32 ((float*) ptr); -+ v.val[1] = vld1q_f32 ((float*) (ptr + 4)); -+ v.val[2] = vld1q_f32 ((float*) (ptr + 8)); -+ v.val[3] = vld1q_f32 ((float*) (ptr + 12)); -+ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); -+ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); -+ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); -+ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); -+ v.val[0] = _mm_movelh_ps(tmp0, tmp2); -+ v.val[1] = _mm_movehl_ps(tmp2, tmp0); -+ v.val[2] = _mm_movelh_ps(tmp1, tmp3); -+ v.val[3] = _mm_movehl_ps(tmp3, tmp1); -+ return v; -+} -+ -+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+#define vld4q_p8 vld4q_u8 -+ -+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+#define vld4q_p16 vld4q_s16 -+ -+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] -+{ -+ uint8x8x4_t v; -+ __m128i sh0, sh1; -+ __m128i val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; -+ -+ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] -+ val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] -+ -+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8); -+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 -+ vst1q_u8(&v.val[0], val0 ); -+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 -+ vst1q_u8(&v.val[2], val2 ); -+ return v; -+} -+ -+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] -+{ -+ uint16x4x4_t v; -+ __m128i sh0, sh1; -+ __m128i val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 -+ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] -+ val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] -+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16); -+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 
1,5,9,13 -+ vst1q_u16(&v.val[0], val0 ); -+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 -+ vst1q_u16(&v.val[2], val2 ); -+ return v; -+} -+ -+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) -+{ -+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 -+ uint32x2x4_t v; -+ __m128i val0, val01, val2; -+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, -+ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 -+ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1, -+ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1 -+ vst1q_u32(&v.val[0], val01); -+ vst1q_u32(&v.val[2], val2 ); -+ return v; -+} -+ -+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] -+{ -+ uint64x1x4_t v; -+ v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1] -+ v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1] -+ v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3] -+ v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3] -+ return v; -+} -+ -+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) -+ -+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) -+ -+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) -+ -+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) -+ -+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example -+ -+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] -+{ -+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 -+ float32x2x4_t res; -+ res.val[0].m64_f32[0] = *(ptr); -+ res.val[0].m64_f32[1] = *(ptr + 4); -+ res.val[1].m64_f32[0] = *(ptr + 1); -+ res.val[1].m64_f32[1] = *(ptr + 5); -+ res.val[2].m64_f32[0] = *(ptr + 2); -+ res.val[2].m64_f32[1] = *(ptr + 6); -+ res.val[3].m64_f32[0] = *(ptr + 3); -+ res.val[3].m64_f32[1] = *(ptr + 7); -+ return res; -+} -+ -+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+#define vld4_p8 vld4_u8 -+ -+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+#define vld4_p16 vld4_u16 -+ -+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* -+//******************************************************************************************************************* -+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] -+{ -+ uint8x8x2_t v; -+ __m128i val0, val1; -+ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ vst1q_u8(v.val, val0); -+ return v; -+} -+ -+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] -+{ -+ uint16x4x2_t v; -+ __m128i val0, val1; -+ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0) -+ _M64(v.val[0], val0); -+ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1) -+ _M64(v.val[1], val1); -+ return v; -+} -+ -+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] -+{ -+ uint32x2x2_t v; -+ __m128i val0; -+ val0 = LOAD_SI128(ptr); //0,1,x,x -+ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 -+ vst1q_u32(v.val, val0); -+ return v; -+} -+ -+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_dup_u64 vld2_u64 -+ -+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) -+ -+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) -+ -+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr) -+ -+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) -+ -+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+// IA32 SIMD doesn't work with 
16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] -+{ -+ float32x2x2_t v; -+ v.val[0].m64_f32[0] = *(ptr); //0,0 -+ v.val[0].m64_f32[1] = *(ptr); //0,0 -+ v.val[1].m64_f32[0] = *(ptr + 1); //1,1 -+ v.val[1].m64_f32[1] = *(ptr + 1); //1,1 -+ return v; -+} -+ -+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+#define vld2_dup_p8 vld2_dup_u8 -+ -+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+#define vld2_dup_p16 vld2_dup_s16 -+ -+//************* Duplicate (or propagate)triplets: ******************* -+//******************************************************************** -+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes -+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] -+{ -+ uint8x8x3_t v; -+ __m128i val0, val1, val2; -+ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, -+ vst1q_u8(v.val, val0); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] -+{ -+ uint16x4x3_t v; -+ __m128i val0, val1, val2; -+ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0) -+ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1) -+ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2) -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] -+{ -+ uint32x2x3_t v; -+ __m128i val0, val1, val2; -+ val2 = LOAD_SI128(ptr); //0,1,2,x -+ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 -+ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 -+ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0 -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] -+{ -+ uint64x1x3_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ return v; -+} -+ -+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) -+ -+int16x4x3_t vld3_dup_s16(__transfersize(3) 
int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) -+ -+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) -+ -+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) -+ -+ -+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] -+{ -+ float32x2x3_t v; -+ int i; -+ for (i = 0; i<3; i++) { -+ v.val[i].m64_f32[0] = *(ptr + i); -+ v.val[i].m64_f32[1] = *(ptr + i); -+ } -+ return v; -+} -+ -+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_p8 vld3_dup_u8 -+ -+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_p16 vld3_dup_s16 -+ -+ -+//************* Duplicate (or propagate) quadruples: ******************* -+//*********************************************************************** -+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes -+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint8x8x4_t v; -+ __m128i val0, val1, val2; -+ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 -+ vst1q_u8(&v.val[0], val0); -+ vst1q_u8(&v.val[2], val2); -+ return v; -+} -+ -+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint16x4x4_t v; -+ __m128i val0, val1, val2, val3; -+ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0) -+ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1) -+ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2) -+ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3) -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ _M64(v.val[3], val3); -+ return v; -+} -+ -+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint32x2x4_t v; -+ __m128i val0, val1, val2, val3; -+ val3 = LOAD_SI128(ptr); //0,1,2,3 -+ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 -+ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 
4) | (3 << 6)); //1,1,2,3 -+ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 -+ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ _M64(v.val[3], val3); -+ return v; -+} -+ -+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] -+{ -+ uint64x1x4_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ v.val[3].m64_u64[0] = *(ptr + 3); -+ return v; -+} -+ -+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) -+ -+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) -+ -+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) -+ -+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) -+ -+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ float32x2x4_t v; -+ int i; -+ for (i = 0; i<4; i++) { -+ v.val[i].m64_f32[0] = *(ptr + i); -+ v.val[i].m64_f32[1] = *(ptr + i); -+ } -+ return v; -+} -+ -+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_p8 vld4_dup_u8 -+ -+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_p16 vld4_dup_u16 -+ -+ -+//********************************************************************************** -+//*******************Lane loads for an N-element structures *********************** -+//********************************************************************************** -+//********************** Lane pairs ************************************************ -+//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon -+//we assume src is 16 bit aligned -+ -+//!!!!!! 
Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error -+//to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined -+ -+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0] -+{ -+ uint16x8x2_t v; -+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] -+{ -+ uint32x4x2_t v; -+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); -+ return v; -+} -+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane) -+{ -+ int16x8x2_t v; -+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane) -+ -+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) -+{ -+ int32x4x2_t v; -+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); -+ return v; -+} -+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) -+ -+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] -+{ -+ float32x4x2_t v; -+ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane) -+ -+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+#define vld2q_lane_p16 vld2q_lane_u16 -+ -+//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t 
vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] -+{ -+ uint8x8x2_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane) -+{ -+ uint16x4x2_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane) -+{ -+ uint32x2x2_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane) -+ -+//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) -+ -+//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) -+ -+//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] -+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane) -+ -+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane) -+{ -+ float32x2x2_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane) -+ -+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] -+#define 
vld2_lane_p8 vld2_lane_u8 -+ -+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+#define vld2_lane_p16 vld2_lane_u16 -+ -+//*********** Lane triplets ********************** -+//************************************************* -+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon -+//we assume src is 16 bit aligned -+ -+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ uint16x8x3_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ uint32x4x3_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ int16x8x3_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane) -+ -+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ int32x4x3_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane) -+ -+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+//current IA SIMD doesn't support float16 -+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane) -+ -+ -+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) 
int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ float32x4x3_t v; -+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); -+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); -+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); -+ return v; -+} -+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane) -+ -+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+#define vld3q_lane_p16 vld3q_lane_u16 -+ -+//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ uint8x8x3_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ uint16x4x3_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ //need to merge into 128 bit anyway -+ uint32x2x3_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);; -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);; -+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);; -+ return v; -+} -+#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane) -+ -+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane) -+ -+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane) -+ -+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane) -+ -+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+//current IA SIMD doesn't support float16 
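/* --- Illustrative sketch, not part of the NDK patch ---
 * The lane loads above all follow one pattern: read N consecutive scalars
 * from ptr and drop each into the requested lane of the corresponding
 * vector via _MM_INSERT_EPI16/32 or vld1_lane_xx, with the "_ptr" wrapper
 * plus #define existing only to sidestep MSVC's "formal parameter with
 * __declspec(align('16')) won't be aligned" error for by-value arguments.
 * The standalone program below mirrors what vld3q_lane_u32 does for a
 * fixed lane of 1, using plain SSE4.1 intrinsics; u32x4x3 and
 * load3_lane_u32_lane1 are made-up stand-in names, and it assumes a
 * compiler with SSE4.1 enabled (e.g. gcc -O2 -msse4.1). */
#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>                      /* SSE4.1: _mm_insert/extract_epi32 */

typedef struct { __m128i val[3]; } u32x4x3; /* stand-in for uint32x4x3_t */

static u32x4x3 load3_lane_u32_lane1(const uint32_t *ptr, u32x4x3 src)
{
    u32x4x3 v;
    /* ptr[k] goes into lane 1 of vector k, all other lanes are kept from
     * src - the same effect as vld3q_lane_u32(ptr, src, 1) above */
    v.val[0] = _mm_insert_epi32(src.val[0], (int)ptr[0], 1);
    v.val[1] = _mm_insert_epi32(src.val[1], (int)ptr[1], 1);
    v.val[2] = _mm_insert_epi32(src.val[2], (int)ptr[2], 1);
    return v;  /* passing/returning these structs by value is what 32-bit
                * MSVC reportedly rejects, hence the pointer wrappers above */
}

int main(void)
{
    uint32_t mem[3] = { 100, 200, 300 };
    u32x4x3 src = { { _mm_set1_epi32(0), _mm_set1_epi32(1), _mm_set1_epi32(2) } };
    u32x4x3 r = load3_lane_u32_lane1(mem, src);
    printf("%d %d %d\n",                     /* prints "100 200 300" */
           _mm_extract_epi32(r.val[0], 1),
           _mm_extract_epi32(r.val[1], 1),
           _mm_extract_epi32(r.val[2], 1));
    return 0;
}
/* --- end of illustrative sketch --- */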
-+ -+//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ float32x2x3_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane) -+ -+//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_p8 vld3_lane_u8 -+ -+//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_p16 vld3_lane_u16 -+ -+//******************* Lane Quadruples load *************************** -+//********************************************************************* -+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon -+//we assume src is 16 bit aligned -+ -+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane) -+{ -+ uint16x8x4_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane); -+ return v; -+} -+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane) -+{ -+ uint32x4x4_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane); -+ return v; -+} -+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane) -+ -+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane) -+ -+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 
const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) -+{ -+ float32x4x4_t v; -+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); -+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); -+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); -+ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); -+ return v; -+} -+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane) -+ -+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_p16 vld4q_lane_u16 -+ -+//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane) -+{ -+ uint8x8x4_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane) -+{ -+ uint16x4x4_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane) -+{ -+ uint32x2x4_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane) -+ -+//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); -+#define 
vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane) -+ -+//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); -+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane) -+ -+//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); -+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane) -+ -+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane) -+{ -+ //serial solution may be faster -+ float32x2x4_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane) -+ -+//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); -+#define vld4_lane_p8 vld4_lane_u8 -+ -+//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); -+#define vld4_lane_p16 vld4_lane_u16 -+ -+//******************* Store duplets ********************************************* -+//******************************************************************************** -+//here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function -+//If necessary you need to modify all store functions accordingly. 
See more comments to "Store single" functions -+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) -+{ -+ uint8x16x2_t v; -+ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); -+ vst1q_u8 (ptr, v.val[0]); -+ vst1q_u8 ((ptr + 16), v.val[1]); -+} -+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) -+ -+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) -+{ -+ uint16x8x2_t v; -+ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); -+ vst1q_u16 (ptr, v.val[0]); -+ vst1q_u16 ((ptr + 8), v.val[1]); -+} -+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) -+ -+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) -+{ -+ uint32x4x2_t v; -+ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); -+ vst1q_u32 (ptr, v.val[0]); -+ vst1q_u32 ((ptr + 4), v.val[1]); -+} -+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) -+ -+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] -+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); -+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) -+ -+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); -+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) -+ -+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] -+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); -+#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) -+ -+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) -+{ -+ float32x4x2_t v; -+ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); -+ vst1q_f32 (ptr, v.val[0]); -+ vst1q_f32 ((ptr + 4), v.val[1]); -+} -+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) -+ -+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] -+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); -+#define vst2q_p8 vst2q_u8 -+ -+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); -+#define vst2q_p16 vst2q_u16 -+ -+//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u8 (ptr, v0); -+} -+#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, 
&val) -+ -+//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u16 (ptr, v0); -+} -+#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val) -+ -+//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u32 (ptr, v0); -+} -+#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val) -+ -+ -+//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] -+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); -+_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+} -+#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val) -+ -+//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] -+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) -+ -+//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] -+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) -+ -+//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] -+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) -+ -+//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); -+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) -+ -+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val) -+{ -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[0].m64_f32[1]; -+ *(ptr + 3) = val->val[1].m64_f32[1]; -+} -+#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val) -+ -+//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] -+#define vst2_p8 vst2_u8 -+ -+//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+#define vst2_p16 vst2_u16 -+ -+//******************** Triplets store ***************************************** -+//****************************************************************************** -+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) -+{ -+ uint8x16x3_t v; -+ __m128i v0,v1,v2, cff, bldmask; -+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; -+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t 
mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; -+ -+ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 -+ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 -+ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 -+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding -+ cff = _mm_cmpeq_epi8(v0, v0); //all ff -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); -+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); -+ vst1q_u8(ptr, v.val[0]); -+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); -+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); -+ vst1q_u8((ptr + 16), v.val[1]); -+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); -+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); -+ vst1q_u8((ptr + 32), v.val[2]); -+} -+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) -+ -+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) -+{ -+ uint16x8x3_t v; -+ __m128i v0,v1,v2, cff, bldmask; -+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; -+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; -+ -+ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 -+ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, -+ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 -+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding -+ cff = _mm_cmpeq_epi16(v0, v0); //all ff -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); -+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); -+ vst1q_u16(ptr, v.val[0]); -+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); -+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); -+ vst1q_u16((ptr + 8), v.val[1]); -+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] 
data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); -+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); -+ vst1q_u16((ptr + 16), v.val[2]); -+} -+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) -+ -+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 -+ uint32x4x3_t v; -+ __m128i tmp0, tmp1,tmp2; -+ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 -+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 -+ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 -+ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, -+ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 -+ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 -+ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 -+ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, -+ -+ vst1q_u32(ptr, v.val[0]); -+ vst1q_u32((ptr + 4), v.val[1]); -+ vst1q_u32((ptr + 8), v.val[2]); -+} -+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) -+ -+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); -+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); -+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) -+ -+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); -+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); -+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) -+ -+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); -+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); -+#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) -+ -+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] -+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) -+{ -+ float32x4x3_t v; -+ __m128 tmp0, tmp1,tmp2; -+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 -+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 -+ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 -+ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, -+ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 -+ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 -+ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 -+ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, -+ -+ vst1q_f32( ptr, v.val[0]); -+ vst1q_f32( (ptr + 4), v.val[1]); -+ vst1q_f32( (ptr + 8), v.val[2]); -+} -+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) -+ -+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] -+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); -+#define 
vst3q_p8 vst3q_u8 -+ -+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] -+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); -+#define vst3q_p16 vst3q_u16 -+ -+//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val) -+{ -+ __m128i tmp, sh0, sh1, val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; -+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; -+ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; -+ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; -+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) ); -+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) -+ val2 = _pM128i(val->val[2]); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); -+ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); -+ vst1q_u8(ptr, val0); //store as 128 bit structure -+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); -+ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); -+ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory -+} -+#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val) -+ -+//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val) -+{ -+ __m128i tmp, val0, val1, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] -+ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] -+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); -+ val2 = _pM128i(val->val[2]); -+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); -+ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f); -+ vst1q_u16(ptr, val0); //store as 128 bit structure -+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); -+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); -+ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order -+ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory -+} -+#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val) -+ -+//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val) -+{ -+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -+ __m128i val0, val1; -+ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5 -+ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 -+ val1 = _mm_srli_si128(val0, 8); //4,5, x,x -+ _M64((*(__m64_128*)(ptr + 4)), val1); -+ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2 -+ val0 = 
_mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 -+ vst1q_u32(ptr, val0); //store as 128 bit structure -+} -+#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val) -+ -+//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+ *(ptr + 2) = val->val[2].m64_u64[0]; -+} -+#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val) -+ -+//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0] -+#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val) -+ -+//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0] -+#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val) -+ -+//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] -+#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val) -+ -+//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0] -+#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val) -+ -+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] -+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val) -+{ -+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5 -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[2].m64_f32[0]; -+ *(ptr + 3) = val->val[0].m64_f32[1]; -+ *(ptr + 4) = val->val[1].m64_f32[1]; -+ *(ptr + 5) = val->val[2].m64_f32[1]; -+} -+#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val) -+ -+//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] -+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); -+#define vst3_p8 vst3_u8 -+ -+//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] -+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); -+#define vst3_p16 vst3_s16 -+ -+//*************** Quadruples store ******************************** -+//********************************************************************* -+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) -+{ -+ __m128i tmp1, tmp2, res; -+ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 -+ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 -+ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 -+ vst1q_u8(ptr, res); -+ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 -+ vst1q_u8((ptr + 16), res); -+ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // -+ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // -+ res = _mm_unpacklo_epi16(tmp1, tmp2); // -+ vst1q_u8((ptr + 32), 
res); -+ res = _mm_unpackhi_epi16(tmp1, tmp2); // -+ vst1q_u8((ptr + 48), res); -+} -+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) -+ -+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) -+{ -+ uint16x8x4_t v; -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); -+ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); -+ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); -+ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); -+ vst1q_u16(ptr, v.val[0]); -+ vst1q_u16((ptr + 8), v.val[1]); -+ vst1q_u16((ptr + 16),v.val[2]); -+ vst1q_u16((ptr + 24), v.val[3]); -+} -+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val) -+ -+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) -+{ -+ uint16x8x4_t v; -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); -+ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); -+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); -+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); -+ vst1q_u32(ptr, v.val[0]); -+ vst1q_u32((ptr + 4), v.val[1]); -+ vst1q_u32((ptr + 8), v.val[2]); -+ vst1q_u32((ptr + 12), v.val[3]); -+} -+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) -+ -+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); -+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); -+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) -+ -+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); -+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); -+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) -+ -+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); -+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); -+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) -+ -+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) -+{ -+ __m128 tmp3, tmp2, tmp1, tmp0; -+ float32x4x4_t v; -+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); -+ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); -+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); -+ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); -+ v.val[0] = _mm_movelh_ps(tmp0, tmp2); -+ v.val[1] = _mm_movehl_ps(tmp2, tmp0); -+ v.val[2] = _mm_movelh_ps(tmp1, tmp3); -+ v.val[3] = _mm_movehl_ps(tmp3, tmp1); -+ vst1q_f32(ptr, 
v.val[0]); -+ vst1q_f32((ptr + 4), v.val[1]); -+ vst1q_f32((ptr + 8), v.val[2]); -+ vst1q_f32((ptr + 12), v.val[3]); -+} -+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) -+ -+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] -+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); -+#define vst4q_p8 vst4q_u8 -+ -+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); -+#define vst4q_p16 vst4q_s16 -+ -+//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val) -+{ -+ __m128i sh0, sh1, val0, val2; -+ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, -+ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 -+ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, -+ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 -+ vst1q_u8(ptr, val0); -+ vst1q_u8((ptr + 16), val2); -+} -+#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val) -+ -+//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val) -+{ -+ __m128i sh0, sh1, val0, val2; -+ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1, -+ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3 -+ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 -+ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 -+ vst1q_u16(ptr, val0); //store as 128 bit structure -+ vst1q_u16((ptr + 8), val2); -+} -+#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val) -+ -+//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val) -+{ -+ //0,4, 1,5, 2,6, 3,7 -+ __m128i sh0, sh1, val0, val1; -+ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5 -+ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7 -+ val0 = _mm_unpacklo_epi64(sh0,sh1); // -+ val1 = _mm_unpackhi_epi64(sh0,sh1); // -+ vst1q_u32(ptr, val0); //store as 128 bit structure -+ vst1q_u32((ptr + 4), val1); -+} -+#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val) -+ -+//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+ *(ptr + 2) = val->val[2].m64_u64[0]; -+ *(ptr + 3) = val->val[3].m64_u64[0]; -+} -+#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val) -+ -+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] -+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) -+ -+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] -+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) -+ -+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, 
d3}, [r0] -+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) -+ -+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] -+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); -+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) -+ -+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val) -+{ -+ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7 -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[2].m64_f32[0]; -+ *(ptr + 3) = val->val[3].m64_f32[0]; -+ *(ptr + 4) = val->val[0].m64_f32[1]; -+ *(ptr + 5) = val->val[1].m64_f32[1]; -+ *(ptr + 6) = val->val[2].m64_f32[1]; -+ *(ptr + 7) = val->val[3].m64_f32[1]; -+} -+#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val) -+ -+//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] -+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); -+#define vst4_p8 vst4_u8 -+ -+//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); -+#define vst4_p16 vst4_u16 -+ -+//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* -+//******************************************************************************************************************** -+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) -+{ -+ vst1q_lane_s16(ptr, val->val[0], lane); -+ vst1q_lane_s16((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_u32(ptr, val->val[0], lane); -+ vst1q_lane_u32((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); -+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] -+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); -+#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int 
lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane) -+ -+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); -+#define vst2q_lane_p16 vst2q_lane_s16 -+ -+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0] -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+} -+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+} -+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] -+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+} -+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); -+#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) -+ -+//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); -+#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] -+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); -+#define vst2_lane_s32(ptr, val, lane) 
vst2_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+} -+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane) -+ -+//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+#define vst2_lane_p8 vst2_lane_u8 -+ -+//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+#define vst2_lane_p16 vst2_lane_u16 -+ -+//************************* Triple lanes stores ******************************************************* -+//******************************************************************************************************* -+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) -+{ -+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); -+ vst1q_lane_u16((ptr + 2), val->val[2], lane); -+} -+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) -+{ -+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); -+ vst1q_lane_u32((ptr + 2), val->val[2], lane); -+} -+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); -+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) -+ -+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); -+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) -+ -+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+ vst1q_lane_f32((ptr + 2), val->val[2], lane); -+} -+#define 
vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); -+#define vst3q_lane_p16 vst3q_lane_s16 -+ -+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+ *(ptr + 2) = val->val[2].m64_u8[lane]; -+} -+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+ *(ptr + 2) = val->val[2].m64_u16[lane]; -+} -+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+ *(ptr + 2) = val->val[2].m64_u32[lane]; -+} -+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); -+#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) -+ -+//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); -+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) -+ -+//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); -+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) -+ -+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); -+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+ *(ptr + 2) = 
val->val[2].m64_f32[lane]; -+} -+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); -+#define vst3_lane_p8 vst3_lane_u8 -+ -+//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); -+#define vst3_lane_p16 vst3_lane_s16 -+ -+//******************************** Quadruple lanes stores *********************************************** -+//******************************************************************************************************* -+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) -+{ -+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); -+ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); -+} -+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane) -+{ -+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); -+ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); -+} -+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); -+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) -+ -+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); -+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) -+ -+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+ vst1q_lane_f32((ptr + 2), val->val[2], lane); -+ vst1q_lane_f32((ptr + 3), val->val[3], lane); -+} -+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], 
d6[0]}, [r0] -+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); -+#define vst4q_lane_p16 vst4q_lane_u16 -+ -+//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+ *(ptr + 2) = val->val[2].m64_u8[lane]; -+ *(ptr + 3) = val->val[3].m64_u8[lane]; -+} -+#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+ *(ptr + 2) = val->val[2].m64_u16[lane]; -+ *(ptr + 3) = val->val[3].m64_u16[lane]; -+} -+#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+ *(ptr + 2) = val->val[2].m64_u32[lane]; -+ *(ptr + 3) = val->val[3].m64_u32[lane]; -+} -+#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) -+ -+//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+ *(ptr + 2) = val->val[2].m64_f32[lane]; -+ *(ptr + 3) = val->val[3].m64_f32[lane]; -+} -+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); -+#define vst4_lane_p8 vst4_lane_u8 -+ -+//void 
vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); -+#define vst4_lane_p16 vst4_lane_u16 -+ -+//************************************************************************************************** -+//************************ Extract lanes from a vector ******************************************** -+//************************************************************************************************** -+//These intrinsics extract a single lane (element) from a vector. -+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+#define vget_lane_u8(vec, lane) vec.m64_u8[lane] -+ -+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] -+#define vget_lane_u16(vec, lane) vec.m64_u16[lane] -+ -+ -+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_u32(vec, lane) vec.m64_u32[lane] -+ -+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] -+#define vget_lane_s8(vec, lane) vec.m64_i8[lane] -+ -+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] -+#define vget_lane_s16(vec, lane) vec.m64_i16[lane] -+ -+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_s32(vec, lane) vec.m64_i32[lane] -+ -+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+#define vget_lane_p8 vget_lane_u8 -+ -+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] -+#define vget_lane_p16 vget_lane_u16 -+ -+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_f32(vec, lane) vec.m64_f32[lane] -+ -+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+#define vgetq_lane_u8 _MM_EXTRACT_EPI8 -+ -+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] -+#define vgetq_lane_u16 _MM_EXTRACT_EPI16 -+ -+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+#define vgetq_lane_u32 _MM_EXTRACT_EPI32 -+ -+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] -+#define vgetq_lane_s8 vgetq_lane_u8 -+ -+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] -+#define vgetq_lane_s16 vgetq_lane_u16 -+ -+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+#define vgetq_lane_s32 vgetq_lane_u32 -+ -+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+#define vgetq_lane_p8 vgetq_lane_u8 -+ -+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] -+#define vgetq_lane_p16 vgetq_lane_u16 -+ -+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) -+{ -+ int32_t ilane; -+ ilane = _MM_EXTRACT_PS(vec,lane); -+ return *(float*)&ilane; -+} -+ -+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+#define vget_lane_s64(vec, lane) vec.m64_i64[0] -+ -+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+#define vget_lane_u64(vec, 
lane) vec.m64_u64[0] -+ -+ -+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+#define vgetq_lane_s64 (int64_t) vgetq_lane_u64 -+ -+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+#define vgetq_lane_u64 _MM_EXTRACT_EPI64 -+ -+// ***************** Set lanes within a vector ******************************************** -+// ************************************************************************************** -+//These intrinsics set a single lane (element) within a vector. -+//same functions as vld1_lane_xx ones, but take the value to be set directly. -+ -+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane) -+{ -+ uint8_t val; -+ val = value; -+ return vld1_lane_u8(&val, vec, lane); -+} -+ -+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16_t val; -+ val = value; -+ return vld1_lane_u16(&val, vec, lane); -+} -+ -+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32_t val; -+ val = value; -+ return vld1_lane_u32(&val, vec, lane); -+} -+ -+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane) -+{ -+ int8_t val; -+ val = value; -+ return vld1_lane_s8(&val, vec, lane); -+} -+ -+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane) -+{ -+ int16_t val; -+ val = value; -+ return vld1_lane_s16(&val, vec, lane); -+} -+ -+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane) -+{ -+ int32_t val; -+ val = value; -+ return vld1_lane_s32(&val, vec, lane); -+} -+ -+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+#define vset_lane_p8 vset_lane_u8 -+ -+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+#define vset_lane_p16 vset_lane_u16 -+ -+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32_t val; -+ val = value; -+ return vld1_lane_f32(&val, vec, lane); -+} -+ -+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane) -+{ -+ uint8_t val; -+ val = value; -+ return vld1q_lane_u8(&val, vec, lane); -+} -+ -+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane) -+{ -+ uint16_t val; -+ val = value; -+ return vld1q_lane_u16(&val, 
vec, lane); -+} -+ -+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane) -+{ -+ uint32_t val; -+ val = value; -+ return vld1q_lane_u32(&val, vec, lane); -+} -+ -+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane) -+{ -+ int8_t val; -+ val = value; -+ return vld1q_lane_s8(&val, vec, lane); -+} -+ -+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane) -+{ -+ int16_t val; -+ val = value; -+ return vld1q_lane_s16(&val, vec, lane); -+} -+ -+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane) -+{ -+ int32_t val; -+ val = value; -+ return vld1q_lane_s32(&val, vec, lane); -+} -+ -+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+#define vsetq_lane_p8 vsetq_lane_u8 -+ -+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+#define vsetq_lane_p16 vsetq_lane_u16 -+ -+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane) -+{ -+ float32_t val; -+ val = value; -+ return vld1q_lane_f32(&val, vec, lane); -+} -+ -+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane) -+{ -+ int64_t val; -+ val = value; -+ return vld1_lane_s64(&val, vec, lane); -+} -+ -+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane) -+{ -+ uint64_t val; -+ val = value; -+ return vld1_lane_u64(&val, vec, lane); -+} -+ -+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane) -+{ -+ uint64_t val; -+ val = value; -+ return vld1q_lane_s64(&val, vec, lane); -+} -+ -+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+#define vsetq_lane_u64 vsetq_lane_s64 -+ -+// ******************************************************************************* -+// **************** Initialize a vector from bit pattern *************************** -+// ******************************************************************************* -+//These intrinsics create a vector from a literal bit pattern. 
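
The lane accessors above map either to a plain member read of the 64-bit emulation struct (vec.m64_u32[lane] and friends) or to the header's _MM_EXTRACT_* / vld1*_lane helpers, so on x86 they behave like ordinary scalar code. Below is an editorial sketch, not part of the upstream patch or of the NEON_2_SSE header itself: hypothetical user code compiled against the x86 <arm_neon.h> this patch installs, using only intrinsics defined in this section (the vdup_n_*/vdupq_n_* "set all lanes" helpers defined just below build the inputs); the function and variable names are the editor's own.

/* Editorial illustration only -- not part of the patch. */
#include <arm_neon.h>                 /* the x86 NEON translation header     */
#include <stdio.h>

int main(void)
{
    /* Q-register (128-bit) emulation: float32x4_t is an __m128 in this header. */
    float32x4_t q = vdupq_n_f32(1.5f);      /* all four lanes = 1.5f            */
    q = vsetq_lane_f32(4.25f, q, 2);        /* overwrite lane 2 only            */
    /* vgetq_lane_f32() expands to _MM_EXTRACT_PS plus a bit cast to float.     */
    printf("q[2]=%f q[3]=%f\n", vgetq_lane_f32(q, 2), vgetq_lane_f32(q, 3));

    /* D-register (64-bit) emulation: lanes are read straight from the
       __m64_128 members (vec.m64_u32[lane]), as the macros above show.         */
    uint32x2_t d = vdup_n_u32(7u);
    d = vset_lane_u32(9u, d, 1);
    printf("d[0]=%u d[1]=%u\n",
           (unsigned)vget_lane_u32(d, 0), (unsigned)vget_lane_u32(d, 1));
    return 0;
}
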
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s8(a) (*(__m64_128*)&(a)) -+ -+ -+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s16 vcreate_s8 -+ -+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s32 vcreate_s8 -+ -+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 -+//no IA32 SIMD avalilable -+ -+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_f32(a) (*(__m64_128*)&(a)) -+ -+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u8 vcreate_s8 -+ -+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u16 vcreate_s16 -+ -+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u32 vcreate_s32 -+ -+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u64 vcreate_s8 -+ -+ -+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_p8 vcreate_u8 -+ -+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_p16 vcreate_u16 -+ -+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s64 vcreate_u64 -+ -+//********************* Set all lanes to same value ******************************** -+//********************************************************************************* -+//These intrinsics set all lanes to the same value. -+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint8x8_t res; -+ int i; -+ for (i = 0; i<8; i++) { -+ res.m64_u8[i] = value; -+ } -+ return res; -+} -+ -+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint16x4_t res; -+ int i; -+ for (i = 0; i<4; i++) { -+ res.m64_u16[i] = value; -+ } -+ return res; -+} -+ -+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = value; -+ res.m64_u32[1] = value; -+ return res; -+} -+ -+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int8x8_t res; -+ int i; -+ for (i = 0; i<8; i++) { -+ res.m64_i8[i] = value; -+ } -+ return res; -+} -+ -+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int16x4_t res; -+ int i; -+ for (i = 0; i<4; i++) { -+ res.m64_i16[i] = value; -+ } -+ return res; -+} -+ -+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t res; -+ res.m64_i32[0] = value; -+ res.m64_i32[1] = value; -+ return res; -+} -+ -+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 -+#define vdup_n_p8 vdup_n_u8 -+ -+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 -+#define vdup_n_p16 vdup_n_s16 -+ -+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = value; -+ res.m64_f32[1] = value; -+ return res; -+} -+ -+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) -+ -+uint16x8_t 
vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) -+ -+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) -+ -+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_s8 _mm_set1_epi8 -+ -+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_s16 _mm_set1_epi16 -+ -+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_s32 _mm_set1_epi32 -+ -+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_p8 vdupq_n_u8 -+ -+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_p16 vdupq_n_u16 -+ -+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_f32 _mm_set1_ps -+ -+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = value; -+ return res; -+} -+ -+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = value; -+ return res; -+} -+ -+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) -+{ -+ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate -+ return LOAD_SI128(value2); -+} -+ -+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate -+ return LOAD_SI128(val); -+} -+ -+//**** Set all lanes to same value ************************ -+//Same functions as above - just aliaces.******************** -+//Probably they reflect the fact that 128-bit functions versions use VMOV instruction ********** -+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 -+#define vmov_n_u8 vdup_n_s8 -+ -+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 -+#define vmov_n_u16 vdup_n_s16 -+ -+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 -+#define vmov_n_u32 vdup_n_u32 -+ -+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 -+#define vmov_n_s8 vdup_n_s8 -+ -+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 -+#define vmov_n_s16 vdup_n_s16 -+ -+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 -+#define vmov_n_s32 vdup_n_s32 -+ -+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 -+#define vmov_n_p8 vdup_n_u8 -+ -+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 -+#define vmov_n_p16 vdup_n_s16 -+ -+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 -+#define vmov_n_f32 vdup_n_f32 -+ -+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_u8 vdupq_n_u8 -+ -+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_u16 vdupq_n_s16 -+ -+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_u32 vdupq_n_u32 -+ -+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_s8 vdupq_n_s8 -+ -+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_s16 vdupq_n_s16 -+ -+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_s32 vdupq_n_s32 -+ -+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_p8 vdupq_n_u8 -+ -+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_p16 vdupq_n_s16 -+ -+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_f32 vdupq_n_f32 -+ -+int64x1_t 
vmov_n_s64(int64_t value); // VMOV d0,r0,r0 -+#define vmov_n_s64 vdup_n_s64 -+ -+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 -+#define vmov_n_u64 vdup_n_u64 -+ -+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 -+#define vmovq_n_s64 vdupq_n_s64 -+ -+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+#define vmovq_n_u64 vdupq_n_u64 -+ -+//**************Set all lanes to the value of one lane of a vector ************* -+//**************************************************************************** -+//here shuffle is better solution than lane extraction followed by set1 function -+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) -+{ -+ uint8x8_t res; -+ uint8_t valane; -+ int i = 0; -+ valane = vec.m64_u8[lane]; -+ for (i = 0; i<8; i++) { -+ res.m64_u8[i] = valane; -+ } -+ return res; -+} -+ -+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16x4_t res; -+ uint16_t valane; -+ valane = vec.m64_u16[lane]; -+ res.m64_u16[0] = valane; -+ res.m64_u16[1] = valane; -+ res.m64_u16[2] = valane; -+ res.m64_u16[3] = valane; -+ return res; -+} -+ -+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = vec.m64_u32[lane]; -+ res.m64_u32[1] = res.m64_u32[0]; -+ return res; -+} -+ -+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+#define vdup_lane_s8 vdup_lane_u8 -+ -+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+#define vdup_lane_s16 vdup_lane_u16 -+ -+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+#define vdup_lane_s32 vdup_lane_u32 -+ -+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+#define vdup_lane_p8 vdup_lane_u8 -+ -+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+#define vdup_lane_p16 vdup_lane_s16 -+ -+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = vec.m64_f32[lane]; -+ res.m64_f32[1] = res.m64_f32[0]; -+ return res; -+} -+ -+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0] -+{ -+ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane}; -+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8); -+} -+ -+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0] -+{ -+ //we could use 8bit shuffle for 16 bit as well -+ const int8_t lane16 = ((int8_t) lane) << 1; -+ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, -+ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1}; -+ return _mm_shuffle_epi8 
(_pM128i(vec), *(__m128i*)lanemask_e16); -+} -+ -+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6)) -+ -+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+#define vdupq_lane_s8 vdupq_lane_u8 -+ -+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+#define vdupq_lane_s16 vdupq_lane_u16 -+ -+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_s32 vdupq_lane_u32 -+ -+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+#define vdupq_lane_p8 vdupq_lane_u8 -+ -+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+#define vdupq_lane_p16 vdupq_lane_s16 -+ -+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane)) -+ -+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+#define vdup_lane_s64(vec,lane) vec -+ -+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+#define vdup_lane_u64(vec,lane) vec -+ -+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane) -+{ -+ __m128i vec128; -+ vec128 = _pM128i(vec); -+ return _mm_unpacklo_epi64(vec128,vec128); -+} -+ -+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+#define vdupq_lane_u64 vdupq_lane_s64 -+ -+// ******************************************************************** -+// ******************** Combining vectors ***************************** -+// ******************************************************************** -+//These intrinsics join two 64 bit vectors into a single 128bit vector. 
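For orientation before the definitions that follow: on the SSE side a 64-bit NEON "D register" lives in the low half of an __m128i, so vcombine_* reduces to _mm_unpacklo_epi64 and vget_high_* to _mm_unpackhi_epi64, which makes split-then-combine an identity. A minimal sketch with an illustrative function name, assuming the header is included as <arm_neon.h>:

#include <arm_neon.h>

/* vcombine(vget_low(q), vget_high(q)) returns q unchanged */
static int16x8_t split_then_combine(int16x8_t q)
{
    int16x4_t lo = vget_low_s16(q);   /* low 64 bits  */
    int16x4_t hi = vget_high_s16(q);  /* high 64 bits */
    return vcombine_s16(lo, hi);
}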
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 -+#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 -+#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 -+#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 -+#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 -+//current IA SIMD doesn't support float16 -+ -+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high) -+{ -+ __m128i res; -+ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); -+ return _M128(res); -+} -+ -+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 -+#define vcombine_u8 vcombine_s8 -+ -+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 -+#define vcombine_u16 vcombine_s16 -+ -+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 -+#define vcombine_u32 vcombine_s32 -+ -+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 -+#define vcombine_u64 vcombine_s64 -+ -+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 -+#define vcombine_p8 vcombine_u8 -+ -+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 -+#define vcombine_p16 vcombine_u16 -+ -+//********************************************************************** -+//************************* Splitting vectors ************************** -+//********************************************************************** -+//**************** Get high part ****************************************** -+//These intrinsics split a 128 bit vector into 2 component 64 bit vectors -+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a) -+{ -+ int64x1_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a) -+{ -+ __m128i res; -+ __m64_128 res64; -+ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a)); -+ return64(res); -+} -+ -+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 -+#define vget_high_u8 vget_high_s8 -+ -+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 -+#define vget_high_u16 vget_high_s16 -+ -+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 -+#define vget_high_u32 
vget_high_s32 -+ -+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 -+#define vget_high_u64 vget_high_s64 -+ -+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 -+#define vget_high_p8 vget_high_u8 -+ -+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 -+#define vget_high_p16 vget_high_u16 -+ -+//********************** Get low part ********************** -+//********************************************************** -+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0 -+{ -+ int16x4_t res64; -+ return64(a); -+} -+ -+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0 -+{ -+ int16x4_t res64; -+ return64(a); -+} -+ -+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0 -+{ -+ int32x2_t res64; -+ return64(a); -+} -+ -+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0 -+{ -+ int64x1_t res64; -+ return64 (a); -+} -+ -+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a) -+{ -+ float32x2_t res64; -+ _M64f(res64, a); -+ return res64; -+} -+ -+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 -+#define vget_low_u8 vget_low_s8 -+ -+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 -+#define vget_low_u16 vget_low_s16 -+ -+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 -+#define vget_low_u32 vget_low_s32 -+ -+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 -+#define vget_low_u64 vget_low_s64 -+ -+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 -+#define vget_low_p8 vget_low_u8 -+ -+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 -+#define vget_low_p16 vget_low_s16 -+ -+//************************************************************************** -+//************************ Converting vectors ********************************** -+//************************************************************************** -+//************* Convert from float *************************************** -+// need to set _MM_SET_ROUNDING_MODE ( x) accordingly -+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 -+_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only -+ return64(res); -+} -+ -+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a) -+{ -+ //may be not effective compared with a serial SIMD solution -+ uint32x2_t res64; -+ __m128i res; -+ res = vcvtq_u32_f32(_pM128(a)); -+ return64(res); -+} -+ -+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 -+#define vcvtq_s32_f32 _mm_cvttps_epi32 -+ -+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 -+{ -+ //No single instruction SSE solution but we could implement it as following: -+ __m128i resi; -+ __m128 zero, mask, a_pos, mask_f_max_si, res; -+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; -+ zero = _mm_setzero_ps(); -+ mask = _mm_cmpgt_ps(a, zero); -+ a_pos = _mm_and_ps(a, mask); -+ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff); -+ res = 
_mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything -+ resi = _mm_cvttps_epi32(res); -+ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si); -+} -+ -+// ***** Convert to the fixed point with the number of fraction bits specified by b *********** -+//************************************************************************************************* -+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 -+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(vcvtq_n_s32_f32(_pM128(a),b)); -+} -+ -+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 -+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res; -+ float convconst; -+ convconst = (float)((uint32_t)1 << b); -+ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst); -+ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst); -+ return res; -+} -+ -+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 -+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ __m128 cconst128; -+ __m128i mask, res; -+ convconst = (float)(1 << b); -+ cconst128 = vdupq_n_f32(convconst); -+ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128)); -+ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000 -+} -+ -+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 -+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128; -+ convconst = (float)(1 << b); -+ cconst128 = vdupq_n_f32(convconst); -+ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); -+} -+ -+//***************** Convert to float ************************* -+//************************************************************* -+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 -+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits -+{ -+ float32x2_t res; -+ res.m64_f32[0] = (float) a.m64_i32[0]; -+ res.m64_f32[1] = (float) a.m64_i32[1]; -+ return res; -+} -+ -+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 -+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = (float) a.m64_u32[0]; -+ res.m64_f32[1] = (float) a.m64_u32[1]; -+ return res; -+} -+ -+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 -+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) -+ -+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 -+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 -+{ -+ //solution may be not optimal -+ __m128 two16, fHi, fLo; -+ __m128i hi, lo; -+ two16 = _mm_set1_ps((float)0x10000); //2^16 -+ // Avoid double rounding by doing two exact conversions -+ // of high and low 16-bit segments -+ hi = _mm_srli_epi32(a, 16); -+ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); -+ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); -+ fLo = _mm_cvtepi32_ps(lo); -+ // do single rounding according to current rounding mode -+ return _mm_add_ps(fHi, fLo); -+} -+ -+// ***** Convert to the float from fixed point with the number of fraction bits specified by b *********** -+float32x2_t vcvt_n_f32_s32(int32x2_t 
a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 -+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ float32x2_t res; -+ float convconst; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ res.m64_f32[0] = a.m64_i32[0] * convconst; -+ res.m64_f32[1] = a.m64_i32[1] * convconst; -+ return res; -+} -+ -+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 -+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32 -+{ -+ float32x2_t res; -+ float convconst; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ res.m64_f32[0] = a.m64_u32[0] * convconst; -+ res.m64_f32[1] = a.m64_u32[1] * convconst; -+ return res; -+} -+ -+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 -+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128, af; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ af = _mm_cvtepi32_ps(a); -+ cconst128 = vdupq_n_f32(convconst); -+ return _mm_mul_ps(af,cconst128); -+} -+ -+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 -+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128, af; -+ convconst = (float)(1. / (1 << b)); -+ af = vcvtq_f32_u32(a); -+ cconst128 = vdupq_n_f32(convconst); -+ return _mm_mul_ps(af,cconst128); -+} -+ -+//**************Convert between floats *********************** -+//************************************************************ -+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 -+//Intel SIMD doesn't support 16bits floats curently -+ -+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 -+//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits -+ -+//************Vector narrow integer conversion (truncation) ****************** -+//**************************************************************************** -+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 -+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0 -+{ -+ int8x8_t res64; -+ __m128i res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only -+ return64(res); -+} -+ -+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 -+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0 -+{ -+ int16x4_t res64; -+ __m128i res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only -+ return64(res); -+} -+ -+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 -+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a) -+{ -+ //may be not effective compared with a serial implementation -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0) -+ return64(res); -+} -+ -+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 -+#define vmovn_u16 vmovn_s16 -+ -+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 -+#define vmovn_u32 vmovn_s32 -+ -+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 -+#define vmovn_u64 vmovn_s64 -+ -+//**************** Vector 
long move *********************** -+//*********************************************************** -+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 -+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1 -+ -+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 -+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1 -+ -+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 -+#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1 -+ -+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 -+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1 -+ -+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0 -+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1 -+ -+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 -+#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1 -+ -+//*************Vector saturating narrow integer***************** -+//************************************************************** -+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 -+_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _mm_packs_epi16(a, a); -+ return64(res); -+} -+ -+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 -+_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = _mm_packs_epi32(a, a); -+ return64(res); -+} -+ -+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution -+{ -+ int32x2_t res; -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX; -+ if(atmp[0]SINT_MAX) atmp[1] = SINT_MAX; -+ if(atmp[1]32 bits numbers -+ res_hi = _mm_or_si128(a, mask); -+ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(res_hi); -+} -+//************* Vector saturating narrow integer signed->unsigned ************** -+//***************************************************************************** -+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 -+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a) -+{ -+ uint8x8_t res64; -+ __m128i res; -+ res = _mm_packus_epi16(a, a); //use low 64bits only -+ return64(res); -+} -+ -+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 -+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a) -+{ -+ uint16x4_t res64; -+ __m128i res; -+ res = _MM_PACKUS1_EPI32(a); //use low 64bits only -+ return64(res); -+} -+ -+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 -+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a) -+{ -+ uint32x2_t res64; -+ __m128i res_hi,res_lo, zero, cmp; -+ zero = _mm_setzero_si128(); -+ res_hi = _mm_srli_epi64(a, 32); -+ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero -+ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive -+ res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff -+ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(res_lo); -+} -+ -+// ******************************************************** -+// **************** Table look up ************************** -+// ******************************************************** -+//VTBL (Vector Table Lookup) uses byte indexes in a 
control vector to look up byte values -+//in a table and generate a new vector. Indexes out of range return 0. -+//for Intel SIMD we need to set the MSB to 1 for zero return -+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ __m128i c7, maskgt, bmask, b128; -+ c7 = _mm_set1_epi8 (7); -+ b128 = _pM128i(b); -+ maskgt = _mm_cmpgt_epi8(b128,c7); -+ bmask = _mm_or_si128(b128,maskgt); -+ bmask = _mm_shuffle_epi8(_pM128i(a),bmask); -+ return64(bmask); -+} -+ -+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 -+#define vtbl1_s8 vtbl1_u8 -+ -+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+#define vtbl1_p8 vtbl1_u8 -+ -+//Special trick to avoid __declspec(align('8')) won't be aligned" error -+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ __m128i c15, a01, maskgt15, bmask, b128; -+ c15 = _mm_set1_epi8 (15); -+ b128 = _pM128i(b); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ bmask = _mm_or_si128(b128, maskgt15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1])); -+ a01 = _mm_shuffle_epi8(a01, bmask); -+ return64(a01); -+} -+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b) -+ -+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+#define vtbl2_s8 vtbl2_u8 -+ -+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+#define vtbl2_p8 vtbl2_u8 -+ -+//Special trick to avoid __declspec(align('16')) won't be aligned" error -+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128; -+ c15 = _mm_set1_epi8 (15); -+ c23 = _mm_set1_epi8 (23); -+ b128 = _pM128i(b); -+ maskgt23 = _mm_cmpgt_epi8(b128,c23); -+ bmask = _mm_or_si128(b128, maskgt23); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); -+ sh0 = _mm_shuffle_epi8(a01, bmask); -+ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1 -+ return64(sh0); -+} -+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b) -+ -+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+#define vtbl3_s8 vtbl3_u8 -+ -+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+#define vtbl3_p8 vtbl3_u8 -+ -+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128; -+ c15 = _mm_set1_epi8 (15); -+ c31 = _mm_set1_epi8 (31); -+ b128 = _pM128i(b); -+ maskgt31 = _mm_cmpgt_epi8(b128,c31); -+ bmask = _mm_or_si128(b128, maskgt31); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); -+ a23 = 
_mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3])); -+ sh0 = _mm_shuffle_epi8(a01, bmask); -+ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1 -+ return64(sh0); -+} -+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b) -+ -+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+#define vtbl4_s8 vtbl4_u8 -+ -+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+#define vtbl4_p8 vtbl4_u8 -+ -+//****************** Extended table look up intrinsics *************************** -+//********************************************************************************** -+//VTBX (Vector Table Extension) works in the same way as VTBL do, -+// except that indexes out of range leave the destination element unchanged. -+ -+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ uint8x8_t res64; -+ __m128i c7, maskgt, sh, c128; -+ c7 = _mm_set1_epi8 (7); -+ c128 = _pM128i(c); -+ maskgt = _mm_cmpgt_epi8(c128,c7); -+ c7 = _mm_and_si128(maskgt,_pM128i(a)); -+ sh = _mm_shuffle_epi8(_pM128i(b),c128); -+ sh = _mm_andnot_si128(maskgt,sh); -+ sh = _mm_or_si128(sh,c7); -+ return64(sh); -+} -+ -+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 -+#define vtbx1_s8 vtbx1_u8 -+ -+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+#define vtbx1_p8 vtbx1_u8 -+ -+//Special trick to avoid __declspec(align('8')) won't be aligned" error -+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c) -+{ -+ uint8x8_t res64; -+ __m128i c15, b01, maskgt15, sh, c128; -+ c15 = _mm_set1_epi8 (15); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128, c15); -+ c15 = _mm_and_si128(maskgt15, _pM128i(a)); -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1])); -+ sh = _mm_shuffle_epi8(b01, c128); -+ sh = _mm_andnot_si128(maskgt15, sh); -+ sh = _mm_or_si128(sh,c15); -+ return64(sh); -+} -+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+#define vtbx2_s8 vtbx2_u8 -+ -+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+#define vtbx2_p8 vtbx2_u8 -+ -+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128; -+ c15 = _mm_set1_epi8 (15); -+ c23 = _mm_set1_epi8 (23); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128,c15); -+ maskgt23 = _mm_cmpgt_epi8(c128,c23); -+ c23 = _mm_and_si128(maskgt23, _pM128i(a)); -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); -+ sh0 = _mm_shuffle_epi8(b01, c128); -+ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); -+ sh0 = 
_mm_andnot_si128(maskgt23,sh0); -+ sh0 = _mm_or_si128(sh0,c23); -+ return64(sh0); -+} -+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c); -+#define vtbx3_s8 vtbx3_u8 -+ -+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c); -+#define vtbx3_p8 vtbx3_u8 -+ -+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128; -+ c15 = _mm_set1_epi8 (15); -+ c31 = _mm_set1_epi8 (31); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128,c15); -+ maskgt31 = _mm_cmpgt_epi8(c128,c31); -+ c31 = _mm_and_si128(maskgt31, _pM128i(a)); -+ -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); -+ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3])); -+ sh0 = _mm_shuffle_epi8(b01, c128); -+ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); -+ sh0 = _mm_andnot_si128(maskgt31,sh0); -+ sh0 = _mm_or_si128(sh0,c31); -+ return64(sh0); -+} -+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c); -+#define vtbx4_s8 vtbx4_u8 -+ -+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c); -+#define vtbx4_p8 vtbx4_u8 -+ -+//************************************************************************************************* -+// *************************** Operations with a scalar value ********************************* -+//************************************************************************************************* -+ -+//******* Vector multiply accumulate by scalar ************************************************* -+//********************************************************************************************** -+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0] -+{ -+ int16_t c; -+ int16x4_t scalar; -+ c = vget_lane_s16(v, l); -+ scalar = vdup_n_s16(c); -+ return vmla_s16(a, b, scalar); -+} -+ -+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0] -+{ -+ int32_t c; -+ int32x2_t scalar; -+ c = vget_lane_s32(v, l); -+ scalar = vdup_n_s32(c); -+ return vmla_s32(a, b, scalar); -+} -+ -+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] -+#define vmla_lane_u16 vmla_lane_s16 -+ -+ -+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] -+#define vmla_lane_u32 vmla_lane_s32 -+ -+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, 
__constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) -+{ -+ float32_t vlane; -+ float32x2_t c; -+ vlane = vget_lane_f32(v, l); -+ c = vdup_n_f32(vlane); -+ return vmla_f32(a,b,c); -+} -+ -+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] -+{ -+ int16_t vlane; -+ int16x8_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdupq_n_s16(vlane); -+ return vmlaq_s16(a,b,c); -+} -+ -+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] -+{ -+ int32_t vlane; -+ int32x4_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdupq_n_s32(vlane); -+ return vmlaq_s32(a,b,c); -+} -+ -+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+#define vmlaq_lane_u16 vmlaq_lane_s16 -+ -+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+#define vmlaq_lane_u32 vmlaq_lane_s32 -+ -+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32_t vlane; -+ float32x4_t c; -+ vlane = vget_lane_f32(v, l); -+ c = vdupq_n_f32(vlane); -+ return vmlaq_f32(a,b,c); -+} -+ -+//***************** Vector widening multiply accumulate by scalar ********************** -+//*************************************************************************************** -+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlal_s16(a, b, c); -+} -+ -+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmlal_s32(a, b, c); -+} -+ -+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_u16(v, l); -+ c = vdup_n_u16(vlane); -+ return vmlal_u16(a, b, c); -+} -+ -+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmlal_u32(a, b, c); -+} -+ 
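The lane and _n variants in this block all follow the same pattern: broadcast the scalar or the selected lane with vdup(q)_n_*, then reuse the plain vector-vector operation. A hypothetical caller-side sketch of the widening multiply-accumulate case, assuming the header is included as <arm_neon.h>:

#include <arm_neon.h>

/* acc[i] += (int32_t)x[i] * coeff[0]; then acc[i] += (int32_t)x[i] * coeff[1] */
static int32x4_t mac_two_taps(int32x4_t acc, int16x4_t x, int16x4_t coeff)
{
    acc = vmlal_lane_s16(acc, x, coeff, 0);
    acc = vmlal_lane_s16(acc, x, coeff, 1);
    return acc;
}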
-+// ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* -+// ************************************************************************************************ -+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vqdmlal_s16(a, b, c); -+} -+ -+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) -+{ -+ int32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vqdmlal_s32(a, b, c); -+} -+ -+// ****** Vector multiply subtract by scalar ***************** -+// ************************************************************* -+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmls_s16(a, b, c); -+} -+ -+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmls_s32(a, b, c); -+} -+ -+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmls_s16(a, b, c); -+} -+ -+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmls_u32(a, b, c); -+} -+ -+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) -+{ -+ float32_t vlane; -+ float32x2_t c; -+ vlane = (float) vget_lane_f32(v, l); -+ c = vdup_n_f32(vlane); -+ return vmls_f32(a,b,c); -+} -+ -+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0] -+{ -+ int16_t vlane; -+ int16x8_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdupq_n_s16(vlane); -+ return vmlsq_s16(a, b,c); -+} -+ -+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t 
vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0] -+{ -+ int32_t vlane; -+ int32x4_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdupq_n_s32(vlane); -+ return vmlsq_s32(a,b,c); -+} -+ -+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x8_t c; -+ vlane = vget_lane_u16(v, l); -+ c = vdupq_n_u16(vlane); -+ return vmlsq_u16(a,b,c); -+} -+ -+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x4_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdupq_n_u32(vlane); -+ return vmlsq_u32(a,b,c); -+} -+ -+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32_t vlane; -+ float32x4_t c; -+ vlane = (float) vget_lane_f32(v, l); -+ c = vdupq_n_f32(vlane); -+ return vmlsq_f32(a,b,c); -+} -+ -+// **** Vector widening multiply subtract by scalar **** -+// **************************************************** -+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlsl_s16(a, b, c); -+} -+ -+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmlsl_s32(a, b, c); -+} -+ -+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlsl_s16(a, b, c); -+} -+ -+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmlsl_u32(a, b, c); -+} -+ -+//********* Vector widening saturating doubling multiply subtract by scalar ************************** -+//****************************************************************************************************** -+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, 
__constrange(0,3) int l) -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vqdmlsl_s16(a, b, c); -+} -+ -+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vqdmlsl_s32(a, b, c); -+} -+//********** Vector multiply with scalar ***************************** -+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0] -+{ -+ int16x4_t b16x4; -+ b16x4 = vdup_n_s16(b); -+ return vmul_s16(a, b16x4); -+} -+ -+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] -+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0] -+{ -+ //serial solution looks faster -+ int32x2_t b32x2; -+ b32x2 = vdup_n_s32(b); -+ return vmul_s32(a, b32x2); -+} -+ -+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] -+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0] -+{ -+ float32x2_t b32x2; -+ b32x2 = vdup_n_f32(b); -+ return vmul_f32(a, b32x2); -+} -+ -+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] -+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0] -+{ -+ uint16x4_t b16x4; -+ b16x4 = vdup_n_s16(b); -+ return vmul_s16(a, b16x4); -+} -+ -+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] -+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0] -+{ -+ //serial solution looks faster -+ uint32x2_t b32x2; -+ b32x2 = vdup_n_u32(b); -+ return vmul_u32(a, b32x2); -+} -+ -+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] -+{ -+ int16x8_t b16x8; -+ b16x8 = vdupq_n_s16(b); -+ return vmulq_s16(a, b16x8); -+} -+ -+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] -+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] -+{ -+ int32x4_t b32x4; -+ b32x4 = vdupq_n_s32(b); -+ return vmulq_s32(a, b32x4); -+} -+ -+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] -+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] -+{ -+ float32x4_t b32x4; -+ b32x4 = vdupq_n_f32(b); -+ return vmulq_f32(a, b32x4); -+} -+ -+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] -+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] -+{ -+ uint16x8_t b16x8; -+ b16x8 = vdupq_n_s16(b); -+ return vmulq_s16(a, b16x8); -+} -+ -+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] -+{ -+ uint32x4_t b32x4; -+ b32x4 = vdupq_n_u32(b); -+ return vmulq_u32(a, b32x4); -+} -+ -+//********** Vector multiply lane ***************************** -+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); -+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c) -+{ -+ int16x4_t b16x4; -+ int16_t vlane; 
-+ vlane = vget_lane_s16(b, c); -+ b16x4 = vdup_n_s16(vlane); -+ return vmul_s16(a, b16x4); -+} -+ -+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c) -+{ -+ int32x2_t b32x2; -+ int32_t vlane; -+ vlane = vget_lane_s32(b, c); -+ b32x2 = vdup_n_s32(vlane); -+ return vmul_s32(a, b32x2); -+} -+ -+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c) -+{ -+ float32x2_t b32x2; -+ float32_t vlane; -+ vlane = vget_lane_f32(b, c); -+ b32x2 = vdup_n_f32(vlane); -+ return vmul_f32(a, b32x2); -+} -+ -+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); -+#define vmul_lane_u16 vmul_lane_s16 -+ -+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); -+#define vmul_lane_u32 vmul_lane_s32 -+ -+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c); -+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c) -+{ -+ int16x8_t b16x8; -+ int16_t vlane; -+ vlane = vget_lane_s16(b, c); -+ b16x8 = vdupq_n_s16(vlane); -+ return vmulq_s16(a, b16x8); -+} -+ -+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c) -+{ -+ int32x4_t b32x4; -+ int32_t vlane; -+ vlane = vget_lane_s32(b, c); -+ b32x4 = vdupq_n_s32(vlane); -+ return vmulq_s32(a, b32x4); -+} -+ -+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c) -+{ -+ float32x4_t b32x4; -+ float32_t vlane; -+ vlane = vget_lane_f32(b, c); -+ b32x4 = vdupq_n_f32(vlane); -+ return vmulq_f32(a, b32x4); -+} -+ -+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); -+#define vmulq_lane_u16 vmulq_lane_s16 -+ -+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); -+#define vmulq_lane_u32 vmulq_lane_s32 -+ -+//**** Vector long multiply with scalar ************ -+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0] -+{ -+ int16x4_t b16x4; -+ b16x4 = vdup_n_s16(val2); -+ return vmull_s16(vec1, b16x4); -+} -+ -+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0] -+{ -+ int32x2_t b32x2; -+ b32x2 = vdup_n_s32(val2); -+ return vmull_s32(vec1, b32x2); -+} -+ -+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0] -+{ -+ uint16x4_t b16x4; -+ b16x4 = vdup_n_s16(val2); -+ return vmull_s16(vec1, b16x4); -+} -+ -+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] -+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0] -+{ -+ uint32x2_t b32x2; -+ b32x2 = vdup_n_u32(val2); -+ return vmull_u32(vec1, b32x2); -+} -+ -+//**** Vector long multiply by scalar **** -+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t 
vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0] -+{ -+ int16_t vlane; -+ int16x4_t b; -+ vlane = vget_lane_s16(val2, val3); -+ b = vdup_n_s16(vlane); -+ return vmull_s16(vec1, b); -+} -+ -+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0] -+{ -+ int32_t vlane; -+ int32x2_t b; -+ vlane = vget_lane_s32(val2, val3); -+ b = vdup_n_s32(vlane); -+ return vmull_s32(vec1, b); -+} -+ -+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t b; -+ vlane = vget_lane_s16(val2, val3); -+ b = vdup_n_s16(vlane); -+ return vmull_s16(vec1, b); -+} -+ -+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] -+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t b; -+ vlane = vget_lane_u32(val2, val3); -+ b = vdup_n_u32(vlane); -+ return vmull_u32(vec1, b); -+} -+ -+//********* Vector saturating doubling long multiply with scalar ******************* -+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2) -+{ -+ //the serial soulution may be faster due to saturation -+ int16x4_t b; -+ b = vdup_n_s16(val2); -+ return vqdmull_s16(vec1, b); -+} -+ -+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t b; -+ b = vdup_n_s32(val2); -+ return vqdmull_s32(vec1,b); //slow serial function!!!! -+} -+ -+//************* Vector saturating doubling long multiply by scalar *********************************************** -+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) -+{ -+ int16_t c; -+ int16x4_t scalar; -+ c = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(c); -+ return vqdmull_s16(vec1, scalar); -+} -+ -+ -+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32_t c; -+ int32x2_t scalar; -+ c = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(c); -+ return vqdmull_s32(vec1,scalar); //slow serial function!!!! 
-+} -+ -+// *****Vector saturating doubling multiply high with scalar ***** -+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2) -+{ -+ int16x4_t res64; -+ return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); -+} -+ -+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2) -+{ -+ int32x2_t res64; -+ return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); -+} -+ -+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(val2); -+ return vqdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(val2); -+ return vqdmulhq_s32(vec1, scalar); -+} -+ -+//***** Vector saturating doubling multiply high by scalar **************** -+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x4_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(vlane); -+ return vqdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32_t vlane; -+ int32x2_t scalar; -+ vlane = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(vlane); -+ return vqdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x8_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdupq_n_s16(vlane ); -+ return vqdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //solution may be not optimal -+ int32_t vlane; -+ int32x4_t scalar; -+ vlane = vgetq_lane_s32(_pM128i(val2), val3); -+ scalar = vdupq_n_s32(vlane ); -+ return vqdmulhq_s32(vec1, scalar); -+} -+ -+//******** Vector saturating rounding doubling multiply high with scalar *** -+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16x4_t scalar; -+ scalar = vdup_n_s16(val2); -+ return vqrdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, 
int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x2_t scalar; -+ scalar = vdup_n_s32(val2); -+ return vqrdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(val2); -+ return vqrdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(val2); -+ return vqrdmulhq_s32(vec1, scalar); -+} -+ -+//********* Vector rounding saturating doubling multiply high by scalar **** -+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x4_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(vlane); -+ return vqrdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32_t vlane; -+ int32x2_t scalar; -+ vlane = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(vlane); -+ return vqrdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x8_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdupq_n_s16(vlane); -+ return vqrdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //solution may be not optimal -+ int32_t vlane; -+ int32x4_t scalar; -+ vlane = vgetq_lane_s32(_pM128i(val2), val3); -+ scalar = vdupq_n_s32(vlane ); -+ return vqrdmulhq_s32(vec1, scalar); -+} -+ -+//**************Vector multiply accumulate with scalar ******************* -+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0] -+{ -+ int16x4_t scalar; -+ scalar = vdup_n_s16(c); -+ return vmla_s16(a, b, scalar); -+} -+ -+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0] -+{ -+ int32x2_t scalar; -+ scalar = vdup_n_s32(c); -+ return vmla_s32(a, b, scalar); -+} -+ -+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t 
b, uint16_t c); // VMLA.I16 d0, d0, d0[0] -+#define vmla_n_u16 vmla_n_s16 -+ -+ -+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] -+#define vmla_n_u32 vmla_n_s32 -+ -+ -+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0] -+{ -+ float32x2_t scalar; -+ scalar = vdup_n_f32(c); -+ return vmla_f32(a, b, scalar); -+} -+ -+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] -+{ -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(c); -+ return vmlaq_s16(a,b,scalar); -+} -+ -+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(c); -+ return vmlaq_s32(a,b,scalar); -+} -+ -+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] -+#define vmlaq_n_u16 vmlaq_n_s16 -+ -+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] -+#define vmlaq_n_u32 vmlaq_n_s32 -+ -+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32x4_t scalar; -+ scalar = vdupq_n_f32(c); -+ return vmlaq_f32(a,b,scalar); -+} -+ -+//************Vector widening multiply accumulate with scalar**************************** -+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlal_s16(a, b, vc); -+} -+ -+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmlal_s32(a, b, vc); -+} -+ -+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlal_s16(a, b, vc); -+} -+ -+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmlal_u32(a, b, vc); -+} -+ -+//************ Vector widening saturating doubling multiply accumulate with scalar ************** -+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) -+{ -+ //not optimal SIMD soulution, serial may be faster -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vqdmlal_s16(a, b, vc); -+} -+ -+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return 
vqdmlal_s32(a, b, vc); -+} -+ -+//******** Vector multiply subtract with scalar ************** -+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmls_s16(a, b, vc); -+} -+ -+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmls_s32(a, b, vc); -+} -+ -+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmls_s16(a, b, vc); -+} -+ -+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmls_u32(a, b, vc); -+} -+ -+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c; -+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c; -+ return res; -+} -+ -+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] -+{ -+ int16x8_t vc; -+ vc = vdupq_n_s16(c); -+ return vmlsq_s16(a, b,vc); -+} -+ -+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0] -+{ -+ int32x4_t vc; -+ vc = vdupq_n_s32(c); -+ return vmlsq_s32(a,b,vc); -+} -+ -+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0] -+{ -+ uint32x4_t vc; -+ vc = vdupq_n_u32(c); -+ return vmlsq_u32(a,b,vc); -+} -+ -+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0] -+{ -+ uint32x4_t vc; -+ vc = vdupq_n_u32(c); -+ return vmlsq_u32(a,b,vc); -+} -+ -+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) -+{ -+ float32x4_t vc; -+ vc = vdupq_n_f32(c); -+ return vmlsq_f32(a,b,vc); -+} -+ -+//**** Vector widening multiply subtract with scalar ****** -+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlsl_s16(a, b, vc); -+} -+ -+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmlsl_s32(a, b, vc); -+} -+ -+uint32x4_t vmlsl_n_u16(uint32x4_t 
a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_u16(c); -+ return vmlsl_u16(a, b, vc); -+} -+ -+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmlsl_u32(a, b, vc); -+} -+ -+//***** Vector widening saturating doubling multiply subtract with scalar ********* -+//********************************************************************************** -+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vqdmlsl_s16(a, b, vc); -+} -+ -+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vqdmlsl_s32(a, b, vc); -+} -+ -+//******************* Vector extract *********************************************** -+//************************************************************************************* -+//VEXT (Vector Extract) extracts elements from the bottom end of the second operand -+//vector and the top end of the first, concatenates them, and places the result in the destination vector -+//c elements from the bottom end of the second operand and (8-c) from the top end of the first -+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int8x8_t res; -+ int i; -+ for (i = 0; i<8 - c; i++) { -+ res.m64_i8[i] = a.m64_i8[i + c]; -+ } -+ for(i = 0; i> 1); -+ tmp = _mm_srli_epi32(res, 2); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); -+ tmp = _mm_srli_epi32(res, 4); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); -+ tmp = _mm_srli_epi32(res, 8); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); -+ tmp = _mm_srli_epi32(res, 16); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); -+ -+ tmp = _mm_srli_epi32(res, 1); -+ tmp = _mm_and_si128(tmp, c55555555); -+ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); -+ -+ tmp = _mm_srli_epi32(res, 2); -+ tmp = _mm_and_si128(tmp, c33333333); -+ tmp1 = _mm_and_si128(res, c33333333); -+ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); -+ -+ tmp = _mm_srli_epi32(res, 4); -+ tmp = _mm_add_epi32(tmp, res); -+ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f); -+ -+ tmp = _mm_srli_epi32(res, 8); -+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); -+ -+ tmp = _mm_srli_epi32(res, 16); -+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); -+ -+ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; -+ -+ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; -+} -+ -+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 -+#define vclzq_u8 vclzq_s8 -+ -+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 -+#define vclzq_u16 vclzq_s16 -+ 
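/*
 * Editor's sketch (not part of the NEON2SSE patch): the scalar algorithm that
 * the SSE2 vclzq_s32 emulation above vectorizes.  Names below are illustrative.
 * Step 1 smears the highest set bit down to bit 0, step 2 is the classic SWAR
 * popcount (the same 0x55555555 / 0x33333333 / 0x0f0f0f0f constants used in the
 * vector code), and the result is 32 minus that population count, so the
 * function returns 32 for a zero input just as VCLZ does.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t clz32_swar(uint32_t x)
{
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;   /* smear the top set bit */
    x |= x >> 8;  x |= x >> 16;
    x -= (x >> 1) & 0x55555555u;               /* SWAR popcount begins */
    x  = ((x >> 2) & 0x33333333u) + (x & 0x33333333u);
    x  = ((x >> 4) + x) & 0x0f0f0f0fu;
    x += x >> 8;
    x += x >> 16;
    x &= 0x0000003fu;
    return 32u - x;
}

int main(void)
{
    printf("%u %u %u\n", clz32_swar(0), clz32_swar(1), clz32_swar(0x80000000u));
    /* prints: 32 31 0 */
    return 0;
}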
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 -+#define vclzq_u32 vclzq_s32 -+ -+//************** Count leading sign bits ************************** -+//******************************************************************** -+//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following -+// the topmost bit, that are the same as the topmost bit, in each element in a vector -+//No corresponding vector intrinsics in IA32, need to implement it. -+//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits -+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 -+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vclsq_s8(_pM128i(a)); -+ return64(res); -+} -+ -+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 -+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = vclsq_s16(_pM128i(a)); -+ return64(res); -+} -+ -+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 -+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = vclsq_s32(_pM128i(a)); -+ return64(res); -+} -+ -+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 -+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) -+{ -+ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; -+ cff = _mm_cmpeq_epi8 (a,a); //0xff -+ c80 = _mm_set1_epi8(0x80); -+ c1 = _mm_set1_epi8(1); -+ a_mask = _mm_and_si128(a, c80); -+ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s8(a_comb); -+ return _mm_sub_epi8(a_comb, c1); -+} -+ -+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 -+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) -+{ -+ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; -+ cffff = _mm_cmpeq_epi16(a,a); -+ c8000 = _mm_slli_epi16(cffff, 15); //0x8000 -+ c1 = _mm_srli_epi16(cffff,15); //0x1 -+ a_mask = _mm_and_si128(a, c8000); -+ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cffff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s16(a_comb); -+ return _mm_sub_epi16(a_comb, c1); -+} -+ -+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 -+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) -+{ -+ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; -+ cffffffff = _mm_cmpeq_epi32(a,a); -+ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 -+ c1 = _mm_srli_epi32(cffffffff,31); //0x1 -+ a_mask = _mm_and_si128(a, c80000000); -+ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cffffffff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s32(a_comb); -+ return _mm_sub_epi32(a_comb, c1); -+} -+ -+//************************* Count number of set bits ******************************** -+//************************************************************************************* -+//No corresponding SIMD solution. 
One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element -+//another option is to do the following algorithm: -+ -+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 -+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a) -+{ -+ uint8x8_t res64; -+ __m128i res; -+ res = vcntq_u8(_pM128i(a)); -+ return64(res); -+} -+ -+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 -+#define vcnt_s8 vcnt_u8 -+ -+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 -+#define vcnt_p8 vcnt_u8 -+ -+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 -+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) -+{ -+ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, -+ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, -+ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, -+ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 }; -+ __m128i maskLOW, mask, lowpopcnt, hipopcnt; -+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set -+ mask = _mm_and_si128(a, maskLOW); -+ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway -+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits -+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set -+ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway -+ return _mm_add_epi8(lowpopcnt, hipopcnt); -+} -+ -+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 -+#define vcntq_s8 vcntq_u8 -+ -+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 -+#define vcntq_p8 vcntq_u8 -+ -+//************************************************************************************** -+//*********************** Logical operations **************************************** -+//************************************************************************************** -+//************************** Bitwise not *********************************** -+//several Bitwise not implementations possible for SIMD. 
Eg "xor" with all ones, but the following one gives good performance -+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vmvnq_s8(_pM128i(a)); -+ return64(res); -+} -+ -+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = vmvnq_s16(_pM128i(a)); -+ return64(res); -+} -+ -+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = vmvnq_s32(_pM128i(a)); -+ return64(res); -+} -+ -+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 -+#define vmvn_u8 vmvn_s8 -+ -+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 -+#define vmvn_u16 vmvn_s16 -+ -+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 -+#define vmvn_u32 vmvn_s32 -+ -+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 -+#define vmvn_p8 vmvn_u8 -+ -+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi8 (a,a); //0xff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi16 (a,a); //0xffff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 -+#define vmvnq_u8 vmvnq_s8 -+ -+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 -+#define vmvnq_u16 vmvnq_s16 -+ -+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 -+#define vmvnq_u32 vmvnq_s32 -+ -+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 -+#define vmvnq_p8 vmvnq_u8 -+ -+//****************** Bitwise and *********************** -+//****************************************************** -+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0]; -+ return res; -+} -+ -+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 -+#define vand_u8 vand_s8 -+ -+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 -+#define vand_u16 vand_s16 -+ -+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 -+#define vand_u32 vand_s32 -+ -+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 -+#define vand_u64 vand_s64 -+ -+ -+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 -+#define vandq_s8 _mm_and_si128 -+ -+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 -+#define vandq_s16 _mm_and_si128 -+ -+int32x4_t vandq_s32(int32x4_t 
a, int32x4_t b); // VAND q0,q0,q0 -+#define vandq_s32 _mm_and_si128 -+ -+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 -+#define vandq_s64 _mm_and_si128 -+ -+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 -+#define vandq_u8 _mm_and_si128 -+ -+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 -+#define vandq_u16 _mm_and_si128 -+ -+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 -+#define vandq_u32 _mm_and_si128 -+ -+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 -+#define vandq_u64 _mm_and_si128 -+ -+//******************** Bitwise or ********************************* -+//****************************************************************** -+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0]; -+ return res; -+} -+ -+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 -+#define vorr_u8 vorr_s8 -+ -+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 -+#define vorr_u16 vorr_s16 -+ -+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 -+#define vorr_u32 vorr_s32 -+ -+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 -+#define vorr_u64 vorr_s64 -+ -+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 -+#define vorrq_s8 _mm_or_si128 -+ -+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 -+#define vorrq_s16 _mm_or_si128 -+ -+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 -+#define vorrq_s32 _mm_or_si128 -+ -+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 -+#define vorrq_s64 _mm_or_si128 -+ -+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 -+#define vorrq_u8 _mm_or_si128 -+ -+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 -+#define vorrq_u16 _mm_or_si128 -+ -+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 -+#define vorrq_u32 _mm_or_si128 -+ -+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 -+#define vorrq_u64 _mm_or_si128 -+ -+//************* Bitwise exclusive or (EOR or XOR) ****************** -+//******************************************************************* -+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 -+#define veor_s16 veor_s8 -+ -+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 -+#define veor_s32 veor_s8 -+ -+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0]; -+ return res; 
-+} -+ -+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 -+#define veor_u8 veor_s8 -+ -+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 -+#define veor_u16 veor_s16 -+ -+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 -+#define veor_u32 veor_s32 -+ -+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 -+#define veor_u64 veor_s64 -+ -+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 -+#define veorq_s8 _mm_xor_si128 -+ -+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 -+#define veorq_s16 _mm_xor_si128 -+ -+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 -+#define veorq_s32 _mm_xor_si128 -+ -+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 -+#define veorq_s64 _mm_xor_si128 -+ -+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 -+#define veorq_u8 _mm_xor_si128 -+ -+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 -+#define veorq_u16 _mm_xor_si128 -+ -+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 -+#define veorq_u32 _mm_xor_si128 -+ -+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 -+#define veorq_u64 _mm_xor_si128 -+ -+//********************** Bit Clear ********************************** -+//******************************************************************* -+//Logical AND complement (AND negation or AND NOT) -+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" -+} -+ -+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 -+#define vbic_s16 vbic_s8 -+ -+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 -+#define vbic_s32 vbic_s8 -+ -+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 -+#define vbic_u8 vbic_s8 -+ -+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 -+#define vbic_u16 vbic_s16 -+ -+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 -+#define vbic_u32 vbic_s32 -+ -+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 -+#define vbic_u64 vbic_s64 -+ -+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 -+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 -+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 -+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 -+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 -+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 -+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 -+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 -+#define 
vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+//**************** Bitwise OR complement ******************************** -+//**************************************** ******************************** -+//no exact IA 32 match, need to implement it as following -+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vornq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vornq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vornq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 -+#define vorn_u8 vorn_s8 -+ -+ -+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 -+#define vorn_u16 vorn_s16 -+ -+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 -+#define vorn_u32 vorn_s32 -+ -+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 -+#define vorn_u64 vorn_s64 -+ -+ -+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s8( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s16( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s32( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b) -+{ -+ __m128i c1, b1; -+ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff -+ b1 = _mm_andnot_si128 (b, c1); -+ return _mm_or_si128 (a, b1); -+} -+ -+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_u8( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s16( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_u32( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 -+#define vornq_u64 vornq_s64 -+ -+//********************* Bitwise Select ***************************** 
-+//****************************************************************** -+//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) -+ -+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the -+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. -+ -+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination -+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged -+ -+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination -+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. -+ -+//VBSL only is implemented for SIMD -+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); -+ return64(res); -+} -+ -+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_s16 vbsl_s8 -+ -+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 -+#define vbsl_s32 vbsl_s8 -+ -+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 -+#define vbsl_u8 vbsl_s8 -+ -+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_u16 vbsl_s8 -+ -+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 -+#define vbsl_u32 vbsl_s8 -+ -+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 -+#define vbsl_u64 vbsl_s64 -+ -+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ __m128 sel1, sel2; -+ __m64_128 res64; -+ sel1 = _mm_and_ps (_pM128(a), _pM128(b)); -+ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c)); -+ sel1 = _mm_or_ps (sel1, sel2); -+ _M64f(res64, sel1); -+ return res64; -+} -+ -+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 -+#define vbsl_p8 vbsl_s8 -+ -+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_p16 vbsl_s8 -+ -+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 -+{ -+ __m128i sel1, sel2; -+ sel1 = _mm_and_si128 (a, b); -+ sel2 = _mm_andnot_si128 (a, c); -+ return _mm_or_si128 (sel1, sel2); -+} -+ -+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_s16 vbslq_s8 -+ -+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 -+#define vbslq_s32 vbslq_s8 -+ -+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 -+#define vbslq_s64 vbslq_s8 -+ -+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 -+#define vbslq_u8 vbslq_s8 -+ -+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_u16 vbslq_s8 -+ -+uint32x4_t 
vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 -+#define vbslq_u32 vbslq_s8 -+ -+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 -+#define vbslq_u64 vbslq_s8 -+ -+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 -+{ -+ __m128 sel1, sel2; -+ sel1 = _mm_and_ps (*(__m128*)&a, b); -+ sel2 = _mm_andnot_ps (*(__m128*)&a, c); -+ return _mm_or_ps (sel1, sel2); -+} -+ -+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 -+#define vbslq_p8 vbslq_u8 -+ -+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_p16 vbslq_s8 -+ -+//************************************************************************************ -+//**************** Transposition operations **************************************** -+//************************************************************************************ -+//***************** Vector Transpose ************************************************ -+//************************************************************************************ -+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. -+// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) -+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp -+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7) -+ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6), -+ return val; -+} -+ -+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15}; -+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3 -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2), -+ return val; -+} -+ -+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 -+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 -+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0, -+ return val; -+} -+ -+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 -+#define vtrn_u8 vtrn_s8 -+ -+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 -+#define vtrn_u16 vtrn_s16 -+ -+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 -+#define vtrn_u32 vtrn_s32 -+ -+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 -+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x2x2_t val; -+ 
val.val[0].m64_f32[0] = a.m64_f32[0]; -+ val.val[0].m64_f32[1] = b.m64_f32[0]; -+ val.val[1].m64_f32[0] = a.m64_f32[1]; -+ val.val[1].m64_f32[1] = b.m64_f32[1]; -+ return val; //a0,b0,a1,b1 -+} -+ -+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 -+#define vtrn_p8 vtrn_u8 -+ -+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 -+#define vtrn_p16 vtrn_s16 -+ -+//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 -+{ -+ int8x16x2_t r8x16; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 -+ -+ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) -+ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) -+ return r8x16; -+} -+ -+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 -+{ -+ int16x8x2_t v16x8; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 -+ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 -+ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 -+ return v16x8; -+} -+ -+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ int32x4x2_t v32x4; -+ __m128i a_sh, b_sh; -+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 -+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 -+ -+ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 -+ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 -+ return v32x4; -+} -+ -+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 -+#define vtrnq_u8 vtrnq_s8 -+ -+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 -+#define vtrnq_u16 vtrnq_s16 -+ -+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 -+#define vtrnq_u32 vtrnq_s32 -+ -+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ float32x4x2_t f32x4; -+ __m128 a_sh, b_sh; -+ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness -+ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness -+ -+ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 -+ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 -+ return f32x4; -+} -+ -+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 -+#define vtrnq_p8 vtrnq_s8 -+ -+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 -+#define vtrnq_p16 vtrnq_s16 -+ 
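/*
 * Editor's sketch (not part of the NEON2SSE patch): a standalone SSE2 demo of
 * the vtrnq_s32 mapping used above.  The shuffle immediate 216 is
 * _MM_SHUFFLE(3,1,2,0), which pre-sorts each input into (even, even, odd, odd)
 * lane order, so a single unpacklo/unpackhi pair then produces the two VTRN.32
 * results (a0,b0,a2,b2) and (a1,b1,a3,b3).
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);       /* a0..a3 */
    __m128i b = _mm_setr_epi32(10, 11, 12, 13);   /* b0..b3 */
    __m128i a_sh = _mm_shuffle_epi32(a, 216);     /* a0,a2,a1,a3 */
    __m128i b_sh = _mm_shuffle_epi32(b, 216);     /* b0,b2,b1,b3 */
    int lo[4], hi[4];
    _mm_storeu_si128((__m128i *)lo, _mm_unpacklo_epi32(a_sh, b_sh));
    _mm_storeu_si128((__m128i *)hi, _mm_unpackhi_epi32(a_sh, b_sh));
    printf("%d %d %d %d\n", lo[0], lo[1], lo[2], lo[3]);   /* 0 10 2 12 */
    printf("%d %d %d %d\n", hi[0], hi[1], hi[2], hi[3]);   /* 1 11 3 13 */
    return 0;
}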
-+//***************** Interleave elements *************************** -+//***************************************************************** -+//output has (a0,b0,a1,b1, a2,b2,.....) -+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); -+ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 -+#define vzip_s32 vtrn_s32 -+ -+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 -+#define vzip_u8 vzip_s8 -+ -+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 -+#define vzip_u16 vzip_s16 -+ -+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 -+#define vzip_u32 vzip_s32 -+ -+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 -+#define vzip_f32 vtrn_f32 -+ -+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 -+#define vzip_p8 vzip_u8 -+ -+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 -+#define vzip_p16 vzip_u16 -+ -+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 -+{ -+ int8x16x2_t r8x16; -+ r8x16.val[0] = _mm_unpacklo_epi8(a, b); -+ r8x16.val[1] = _mm_unpackhi_epi8(a, b); -+ return r8x16; -+} -+ -+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 -+{ -+ int16x8x2_t r16x8; -+ r16x8.val[0] = _mm_unpacklo_epi16(a, b); -+ r16x8.val[1] = _mm_unpackhi_epi16(a, b); -+ return r16x8; -+} -+ -+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 -+{ -+ int32x4x2_t r32x4; -+ r32x4.val[0] = _mm_unpacklo_epi32(a, b); -+ r32x4.val[1] = _mm_unpackhi_epi32(a, b); -+ return r32x4; -+} -+ -+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 -+#define vzipq_u8 vzipq_s8 -+ -+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 -+#define vzipq_u16 vzipq_s16 -+ -+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 -+#define vzipq_u32 vzipq_s32 -+ -+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 -+{ -+ float32x4x2_t f32x4; -+ f32x4.val[0] = _mm_unpacklo_ps ( a, b); -+ f32x4.val[1] = _mm_unpackhi_ps ( a, b); -+ return f32x4; -+} -+ -+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 -+#define vzipq_p8 vzipq_u8 -+ -+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 -+#define vzipq_p16 vzipq_u16 -+ -+//*********************** De-Interleave elements ************************* -+//************************************************************************* -+//As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...) 
-+//no such functions in IA32 SIMD, shuffle is required -+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15}; -+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7) -+ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; -+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3 -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 -+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0 -+{ -+ int32x2x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 -+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 -+#define vuzp_u8 vuzp_s8 -+ -+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 -+#define vuzp_u16 vuzp_s16 -+ -+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 -+#define vuzp_u32 vuzp_s32 -+ -+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 -+#define vuzp_f32 vzip_f32 -+ -+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 -+#define vuzp_p8 vuzp_u8 -+ -+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 -+#define vuzp_p16 vuzp_u16 -+ -+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 -+{ -+ int8x16x2_t v8x16; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 -+ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b -+ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, -+ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 -+ return v8x16; -+} -+ -+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 -+{ -+ int16x8x2_t v16x8; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, 
b4, b6, b1, b3, b5, b7 -+ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 -+ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 -+ return v16x8; -+} -+ -+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ int32x4x2_t v32x4; -+ __m128i a_sh, b_sh; -+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 -+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 -+ -+ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 -+ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 -+ return v32x4; -+} -+ -+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 -+#define vuzpq_u8 vuzpq_s8 -+ -+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 -+#define vuzpq_u16 vuzpq_s16 -+ -+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 -+#define vuzpq_u32 vuzpq_s32 -+ -+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 -+{ -+ float32x4x2_t v32x4; -+ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however -+ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however -+ return v32x4; -+} -+ -+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 -+#define vuzpq_p8 vuzpq_u8 -+ -+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 -+#define vuzpq_p16 vuzpq_u16 -+ -+//############################################################################################## -+//*********************** Reinterpret cast intrinsics.****************************************** -+//############################################################################################## -+// Not a part of oficial NEON instruction set but available in gcc compiler ********************* -+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t); -+#define vreinterpret_p8_u32 -+ -+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t); -+#define vreinterpret_p8_u16 -+ -+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t); -+#define vreinterpret_p8_u8 -+ -+poly8x8_t vreinterpret_p8_s32 (int32x2_t t); -+#define vreinterpret_p8_s32 -+ -+poly8x8_t vreinterpret_p8_s16 (int16x4_t t); -+#define vreinterpret_p8_s16 -+ -+poly8x8_t vreinterpret_p8_s8 (int8x8_t t); -+#define vreinterpret_p8_s8 -+ -+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t); -+#define vreinterpret_p8_u64 -+ -+poly8x8_t vreinterpret_p8_s64 (int64x1_t t); -+#define vreinterpret_p8_s64 -+ -+poly8x8_t vreinterpret_p8_f32 (float32x2_t t); -+#define vreinterpret_p8_f32 -+ -+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t); -+#define vreinterpret_p8_p16 -+ -+poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); -+#define vreinterpretq_p8_u32 -+ -+poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); -+#define vreinterpretq_p8_u16 -+ -+poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); -+#define vreinterpretq_p8_u8 -+ -+poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); -+#define vreinterpretq_p8_s32 -+ -+poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); -+#define vreinterpretq_p8_s16 -+ -+poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); -+#define vreinterpretq_p8_s8 -+ -+poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); -+#define vreinterpretq_p8_u64 -+ -+poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); -+#define vreinterpretq_p8_s64 
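/*
 * Editor's sketch (not part of the NEON2SSE patch): why most of the
 * vreinterpret* macros in this section can expand to nothing.  All integer and
 * poly vector types share the same __m128i (or __m64_128) container, so the
 * object-like empty define makes vreinterpretq_p8_u32(x) collapse to just (x).
 * Only float<->integer reinterprets need a real, still cost-free cast, which
 * plain SSE expresses with _mm_castsi128_ps / _mm_castps_si128.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i bits = _mm_set1_epi32(0x3f800000);    /* bit pattern of 1.0f */
    __m128  f    = _mm_castsi128_ps(bits);        /* reinterpret, no conversion */
    float out[4];
    _mm_storeu_ps(out, f);
    printf("%f\n", out[0]);                       /* prints 1.000000 */
    __m128i back = _mm_castps_si128(f);           /* round-trips the same bits */
    (void)back;
    return 0;
}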
-+ -+poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); -+#define vreinterpretq_p8_f32(t) _M128i(t) -+ -+poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); -+#define vreinterpretq_p8_p16 -+ -+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t); -+#define vreinterpret_p16_u32 -+ -+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t); -+#define vreinterpret_p16_u16 -+ -+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t); -+#define vreinterpret_p16_u8 -+ -+poly16x4_t vreinterpret_p16_s32 (int32x2_t t); -+#define vreinterpret_p16_s32 -+ -+poly16x4_t vreinterpret_p16_s16 (int16x4_t t); -+#define vreinterpret_p16_s16 -+ -+poly16x4_t vreinterpret_p16_s8 (int8x8_t t); -+#define vreinterpret_p16_s8 -+ -+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t); -+#define vreinterpret_p16_u64 -+ -+poly16x4_t vreinterpret_p16_s64 (int64x1_t t); -+#define vreinterpret_p16_s64 -+ -+poly16x4_t vreinterpret_p16_f32 (float32x2_t t); -+#define vreinterpret_p16_f32 -+ -+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t); -+#define vreinterpret_p16_p8 -+ -+poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); -+#define vreinterpretq_p16_u32 -+ -+poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); -+#define vreinterpretq_p16_u16 -+ -+poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); -+#define vreinterpretq_p16_s32 -+ -+poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); -+#define vreinterpretq_p16_s16 -+ -+poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); -+#define vreinterpretq_p16_s8 -+ -+poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); -+#define vreinterpretq_p16_u64 -+ -+poly16x8_t vreinterpretq_p16_s64 (int64x2_t t); -+#define vreinterpretq_p16_s64 -+ -+poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); -+#define vreinterpretq_p16_f32(t) _M128i(t) -+ -+poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); -+#define vreinterpretq_p16_p8 vreinterpretq_s16_p8 -+ -+//**** Integer to float ****** -+float32x2_t vreinterpret_f32_u32 (uint32x2_t t); -+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t)) -+ -+ -+float32x2_t vreinterpret_f32_u16 (uint16x4_t t); -+#define vreinterpret_f32_u16 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_u8 (uint8x8_t t); -+#define vreinterpret_f32_u8 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s32 (int32x2_t t); -+#define vreinterpret_f32_s32 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s16 (int16x4_t t); -+#define vreinterpret_f32_s16 vreinterpret_f32_u32 -+ -+float32x2_t vreinterpret_f32_s8 (int8x8_t t); -+#define vreinterpret_f32_s8 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_u64(uint64x1_t t); -+#define vreinterpret_f32_u64 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s64 (int64x1_t t); -+#define vreinterpret_f32_s64 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_p16 (poly16x4_t t); -+#define vreinterpret_f32_p16 vreinterpret_f32_u32 -+ -+float32x2_t vreinterpret_f32_p8 (poly8x8_t t); -+#define vreinterpret_f32_p8 vreinterpret_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); -+#define vreinterpretq_f32_u32(t) *(__m128*)&(t) -+ -+float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); -+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); -+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s32 (int32x4_t t); -+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s16 (int16x8_t t); -+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s8 (int8x16_t t); -+#define vreinterpretq_f32_s8 
vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); -+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s64 (int64x2_t t); -+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); -+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); -+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32 -+ -+//*** Integer type conversions ****************** -+//no conversion necessary for the following functions because it is same data type -+int64x1_t vreinterpret_s64_u32 (uint32x2_t t); -+#define vreinterpret_s64_u32 -+ -+int64x1_t vreinterpret_s64_u16 (uint16x4_t t); -+#define vreinterpret_s64_u16 -+ -+int64x1_t vreinterpret_s64_u8 (uint8x8_t t); -+#define vreinterpret_s64_u8 -+ -+int64x1_t vreinterpret_s64_s32 (int32x2_t t); -+#define vreinterpret_s64_s32 -+ -+int64x1_t vreinterpret_s64_s16 (int16x4_t t); -+#define vreinterpret_s64_s16 -+ -+int64x1_t vreinterpret_s64_s8 (int8x8_t t); -+#define vreinterpret_s64_s8 -+ -+int64x1_t vreinterpret_s64_u64 (uint64x1_t t); -+#define vreinterpret_s64_u64 -+ -+int64x1_t vreinterpret_s64_f32 (float32x2_t t); -+#define vreinterpret_s64_f32 -+ -+int64x1_t vreinterpret_s64_p16 (poly16x4_t t); -+#define vreinterpret_s64_p16 -+ -+int64x1_t vreinterpret_s64_p8 (poly8x8_t t); -+#define vreinterpret_s64_p8 -+ -+int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); -+#define vreinterpretq_s64_u32 -+ -+int64x2_t vreinterpretq_s64_s16 (uint16x8_t t); -+#define vreinterpretq_s64_s16 -+ -+int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); -+#define vreinterpretq_s64_u8 -+ -+int64x2_t vreinterpretq_s64_s32 (int32x4_t t); -+#define vreinterpretq_s64_s32 -+ -+int64x2_t vreinterpretq_s64_u16 (int16x8_t t); -+#define vreinterpretq_s64_u16 -+ -+int64x2_t vreinterpretq_s64_s8 (int8x16_t t); -+#define vreinterpretq_s64_s8 -+ -+int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); -+#define vreinterpretq_s64_u64 -+ -+int64x2_t vreinterpretq_s64_f32 (float32x4_t t); -+#define vreinterpretq_s64_f32(t) _M128i(t) -+ -+int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); -+#define vreinterpretq_s64_p16 -+ -+int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); -+#define vreinterpretq_s64_p8 -+ -+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t); -+#define vreinterpret_u64_u32 -+ -+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t); -+#define vreinterpret_u64_u16 -+ -+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t); -+#define vreinterpret_u64_u8 -+ -+uint64x1_t vreinterpret_u64_s32 (int32x2_t t); -+#define vreinterpret_u64_s32 -+ -+uint64x1_t vreinterpret_u64_s16 (int16x4_t t); -+#define vreinterpret_u64_s16 -+ -+uint64x1_t vreinterpret_u64_s8 (int8x8_t t); -+#define vreinterpret_u64_s8 -+ -+uint64x1_t vreinterpret_u64_s64 (int64x1_t t); -+#define vreinterpret_u64_s64 -+ -+uint64x1_t vreinterpret_u64_f32 (float32x2_t t); -+#define vreinterpret_u64_f32 -+ -+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t); -+#define vreinterpret_u64_p16 -+ -+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t); -+#define vreinterpret_u64_p8 -+ -+uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); -+#define vreinterpretq_u64_u32 -+ -+uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); -+#define vreinterpretq_u64_u16 -+ -+uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); -+#define vreinterpretq_u64_u8 -+ -+uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); -+#define vreinterpretq_u64_s32 -+ -+uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); -+#define vreinterpretq_u64_s16 -+ -+uint64x2_t 
vreinterpretq_u64_s8 (int8x16_t t); -+#define vreinterpretq_u64_s8 -+ -+uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); -+#define vreinterpretq_u64_s64 -+ -+uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); -+#define vreinterpretq_u64_f32(t) _M128i(t) -+ -+uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); -+#define vreinterpretq_u64_p16 -+ -+uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); -+#define vreinterpretq_u64_p8 -+ -+int8x8_t vreinterpret_s8_u32 (uint32x2_t t); -+#define vreinterpret_s8_u32 -+ -+int8x8_t vreinterpret_s8_u16 (uint16x4_t t); -+#define vreinterpret_s8_u16 -+ -+int8x8_t vreinterpret_s8_u8 (uint8x8_t t); -+#define vreinterpret_s8_u8 -+ -+int8x8_t vreinterpret_s8_s32 (int32x2_t t); -+#define vreinterpret_s8_s32 -+ -+int8x8_t vreinterpret_s8_s16 (int16x4_t t); -+#define vreinterpret_s8_s16 -+ -+int8x8_t vreinterpret_s8_u64 (uint64x1_t t); -+#define vreinterpret_s8_u64 -+ -+int8x8_t vreinterpret_s8_s64 (int64x1_t t); -+#define vreinterpret_s8_s64 -+ -+int8x8_t vreinterpret_s8_f32 (float32x2_t t); -+#define vreinterpret_s8_f32 -+ -+int8x8_t vreinterpret_s8_p16 (poly16x4_t t); -+#define vreinterpret_s8_p16 -+ -+int8x8_t vreinterpret_s8_p8 (poly8x8_t t); -+#define vreinterpret_s8_p8 -+ -+int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); -+#define vreinterpretq_s8_u32 -+ -+int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); -+#define vreinterpretq_s8_u16 -+ -+int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); -+#define vreinterpretq_s8_u8 -+ -+int8x16_t vreinterpretq_s8_s32 (int32x4_t t); -+#define vreinterpretq_s8_s32 -+ -+int8x16_t vreinterpretq_s8_s16 (int16x8_t t); -+#define vreinterpretq_s8_s16 -+ -+int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); -+#define vreinterpretq_s8_u64 -+ -+int8x16_t vreinterpretq_s8_s64 (int64x2_t t); -+#define vreinterpretq_s8_s64 -+ -+int8x16_t vreinterpretq_s8_f32 (float32x4_t t); -+#define vreinterpretq_s8_f32(t) _M128i(t) -+ -+int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); -+#define vreinterpretq_s8_p16 -+ -+int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); -+#define vreinterpretq_s8_p8 -+ -+int16x4_t vreinterpret_s16_u32 (uint32x2_t t); -+#define vreinterpret_s16_u32 -+ -+int16x4_t vreinterpret_s16_u16 (uint16x4_t t); -+#define vreinterpret_s16_u16 -+ -+int16x4_t vreinterpret_s16_u8 (uint8x8_t t); -+#define vreinterpret_s16_u8 -+ -+int16x4_t vreinterpret_s16_s32 (int32x2_t t); -+#define vreinterpret_s16_s32 -+ -+int16x4_t vreinterpret_s16_s8 (int8x8_t t); -+#define vreinterpret_s16_s8 -+ -+int16x4_t vreinterpret_s16_u64 (uint64x1_t t); -+#define vreinterpret_s16_u64 -+ -+int16x4_t vreinterpret_s16_s64 (int64x1_t t); -+#define vreinterpret_s16_s64 -+ -+int16x4_t vreinterpret_s16_f32 (float32x2_t t); -+#define vreinterpret_s16_f32 -+ -+ -+int16x4_t vreinterpret_s16_p16 (poly16x4_t t); -+#define vreinterpret_s16_p16 -+ -+int16x4_t vreinterpret_s16_p8 (poly8x8_t t); -+#define vreinterpret_s16_p8 -+ -+int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); -+#define vreinterpretq_s16_u32 -+ -+int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); -+#define vreinterpretq_s16_u16 -+ -+int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); -+#define vreinterpretq_s16_u8 -+ -+int16x8_t vreinterpretq_s16_s32 (int32x4_t t); -+#define vreinterpretq_s16_s32 -+ -+int16x8_t vreinterpretq_s16_s8 (int8x16_t t); -+#define vreinterpretq_s16_s8 -+ -+int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); -+#define vreinterpretq_s16_u64 -+ -+int16x8_t vreinterpretq_s16_s64 (int64x2_t t); -+#define vreinterpretq_s16_s64 -+ -+int16x8_t vreinterpretq_s16_f32 (float32x4_t t); -+#define vreinterpretq_s16_f32(t) 
_M128i(t) -+ -+int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); -+#define vreinterpretq_s16_p16 -+ -+int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); -+#define vreinterpretq_s16_p8 -+ -+int32x2_t vreinterpret_s32_u32 (uint32x2_t t); -+#define vreinterpret_s32_u32 -+ -+int32x2_t vreinterpret_s32_u16 (uint16x4_t t); -+#define vreinterpret_s32_u16 -+ -+int32x2_t vreinterpret_s32_u8 (uint8x8_t t); -+#define vreinterpret_s32_u8 -+ -+int32x2_t vreinterpret_s32_s16 (int16x4_t t); -+#define vreinterpret_s32_s16 -+ -+int32x2_t vreinterpret_s32_s8 (int8x8_t t); -+#define vreinterpret_s32_s8 -+ -+int32x2_t vreinterpret_s32_u64 (uint64x1_t t); -+#define vreinterpret_s32_u64 -+ -+int32x2_t vreinterpret_s32_s64 (int64x1_t t); -+#define vreinterpret_s32_s64 -+ -+int32x2_t vreinterpret_s32_f32 (float32x2_t t); -+#define vreinterpret_s32_f32 -+ -+int32x2_t vreinterpret_s32_p16 (poly16x4_t t); -+#define vreinterpret_s32_p16 -+ -+int32x2_t vreinterpret_s32_p8 (poly8x8_t t); -+#define vreinterpret_s32_p8 -+ -+int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); -+#define vreinterpretq_s32_u32 -+ -+int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); -+#define vreinterpretq_s32_u16 -+ -+int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); -+#define vreinterpretq_s32_u8 -+ -+int32x4_t vreinterpretq_s32_s16 (int16x8_t t); -+#define vreinterpretq_s32_s16 -+ -+int32x4_t vreinterpretq_s32_s8 (int8x16_t t); -+#define vreinterpretq_s32_s8 -+ -+int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); -+#define vreinterpretq_s32_u64 -+ -+int32x4_t vreinterpretq_s32_s64 (int64x2_t t); -+#define vreinterpretq_s32_s64 -+ -+int32x4_t vreinterpretq_s32_f32 (float32x4_t t); -+#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t)) -+ -+int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); -+#define vreinterpretq_s32_p16 -+ -+int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); -+#define vreinterpretq_s32_p8 -+ -+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t); -+#define vreinterpret_u8_u32 -+ -+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t); -+#define vreinterpret_u8_u16 -+ -+uint8x8_t vreinterpret_u8_s32 (int32x2_t t); -+#define vreinterpret_u8_s32 -+ -+uint8x8_t vreinterpret_u8_s16 (int16x4_t t); -+#define vreinterpret_u8_s16 -+ -+uint8x8_t vreinterpret_u8_s8 (int8x8_t t); -+#define vreinterpret_u8_s8 -+ -+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t); -+#define vreinterpret_u8_u64 -+ -+uint8x8_t vreinterpret_u8_s64 (int64x1_t t); -+#define vreinterpret_u8_s64 -+ -+uint8x8_t vreinterpret_u8_f32 (float32x2_t t); -+#define vreinterpret_u8_f32 -+ -+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t); -+#define vreinterpret_u8_p16 -+ -+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t); -+#define vreinterpret_u8_p8 -+ -+uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); -+#define vreinterpretq_u8_u32 -+ -+uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); -+#define vreinterpretq_u8_u16 -+ -+uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); -+#define vreinterpretq_u8_s32 -+ -+uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); -+#define vreinterpretq_u8_s16 -+ -+uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); -+#define vreinterpretq_u8_s8 -+ -+uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); -+#define vreinterpretq_u8_u64 -+ -+uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); -+#define vreinterpretq_u8_s64 -+ -+uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); -+#define vreinterpretq_u8_f32(t) _M128i(t) -+ -+ -+uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); -+#define vreinterpretq_u8_p16 -+ -+uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); -+#define vreinterpretq_u8_p8 -+ -+uint16x4_t 
vreinterpret_u16_u32 (uint32x2_t t); -+#define vreinterpret_u16_u32 -+ -+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t); -+#define vreinterpret_u16_u8 -+ -+uint16x4_t vreinterpret_u16_s32 (int32x2_t t); -+#define vreinterpret_u16_s32 -+ -+uint16x4_t vreinterpret_u16_s16 (int16x4_t t); -+#define vreinterpret_u16_s16 -+ -+uint16x4_t vreinterpret_u16_s8 (int8x8_t t); -+#define vreinterpret_u16_s8 -+ -+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t); -+#define vreinterpret_u16_u64 -+ -+uint16x4_t vreinterpret_u16_s64 (int64x1_t t); -+#define vreinterpret_u16_s64 -+ -+uint16x4_t vreinterpret_u16_f32 (float32x2_t t); -+#define vreinterpret_u16_f32 -+ -+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t); -+#define vreinterpret_u16_p16 -+ -+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t); -+#define vreinterpret_u16_p8 -+ -+uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); -+#define vreinterpretq_u16_u32 -+ -+uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); -+#define vreinterpretq_u16_u8 -+ -+uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); -+#define vreinterpretq_u16_s32 -+ -+uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); -+#define vreinterpretq_u16_s16 -+ -+uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); -+#define vreinterpretq_u16_s8 -+ -+uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); -+#define vreinterpretq_u16_u64 -+ -+uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); -+#define vreinterpretq_u16_s64 -+ -+uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); -+#define vreinterpretq_u16_f32(t) _M128i(t) -+ -+uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); -+#define vreinterpretq_u16_p16 -+ -+uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); -+#define vreinterpretq_u16_p8 -+ -+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t); -+#define vreinterpret_u32_u16 -+ -+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t); -+#define vreinterpret_u32_u8 -+ -+uint32x2_t vreinterpret_u32_s32 (int32x2_t t); -+#define vreinterpret_u32_s32 -+ -+uint32x2_t vreinterpret_u32_s16 (int16x4_t t); -+#define vreinterpret_u32_s16 -+ -+uint32x2_t vreinterpret_u32_s8 (int8x8_t t); -+#define vreinterpret_u32_s8 -+ -+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t); -+#define vreinterpret_u32_u64 -+ -+uint32x2_t vreinterpret_u32_s64 (int64x1_t t); -+#define vreinterpret_u32_s64 -+ -+uint32x2_t vreinterpret_u32_f32 (float32x2_t t); -+#define vreinterpret_u32_f32 -+ -+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t); -+#define vreinterpret_u32_p16 -+ -+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t); -+#define vreinterpret_u32_p8 -+ -+uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); -+#define vreinterpretq_u32_u16 -+ -+uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); -+#define vreinterpretq_u32_u8 -+ -+uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); -+#define vreinterpretq_u32_s32 -+ -+uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); -+#define vreinterpretq_u32_s16 -+ -+uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); -+#define vreinterpretq_u32_s8 -+ -+uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); -+#define vreinterpretq_u32_u64 -+ -+uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); -+#define vreinterpretq_u32_s64 -+ -+uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); -+#define vreinterpretq_u32_f32(t) _M128i(t) -+ -+uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); -+#define vreinterpretq_u32_p16 -+ -+uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); -+#define vreinterpretq_u32_p8 -+ -+#endif /* NEON2SSE_H */ -diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h -index fee33a3..22fb2ce 100644 ---- a/gcc/config/i386/gnu-user.h -+++ 
b/gcc/config/i386/gnu-user.h -@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see - When the -shared link option is used a final link is not being - done. */ - -+#undef ANDROID_TARGET_CC1_SPEC -+#define ANDROID_TARGET_CC1_SPEC \ -+ " -mssse3 -fno-short-enums " \ -+ - #undef ASM_SPEC - #define ASM_SPEC \ -- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" -+ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) - - #undef SUBTARGET_EXTRA_SPECS - #define SUBTARGET_EXTRA_SPECS \ -diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h -index 7a02a7e..cac4179 100644 ---- a/gcc/config/i386/gnu-user64.h -+++ b/gcc/config/i386/gnu-user64.h -@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - #define SPEC_X32 "mx32" - #endif - -+#undef ANDROID_TARGET_CC1_SPEC -+#define ANDROID_TARGET_CC1_SPEC \ -+ "%{m32:-mssse3 -fno-short-enums}" \ -+ "%{!m32:-msse4.2 -mpopcnt}" -+ - #undef ASM_SPEC - #define ASM_SPEC "%{" SPEC_32 ":--32} \ - %{" SPEC_64 ":--64} \ -diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c -index 3d044e8..5c89fca 100644 ---- a/gcc/config/i386/i386.c -+++ b/gcc/config/i386/i386.c -@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) - else if (!SYMBOL_REF_FAR_ADDR_P (op0) - && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC -+ && !TARGET_HAS_BIONIC - && flag_pie - && !SYMBOL_REF_WEAK (op0) - && !SYMBOL_REF_FUNCTION_P (op0))) -diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h -index 4b9910f..c33e074 100644 ---- a/gcc/config/i386/linux-common.h -+++ b/gcc/config/i386/linux-common.h -@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. 
If not see - #undef CC1_SPEC - #define CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) -+ GNU_USER_TARGET_CC1_SPEC \ -+ ANDROID_TARGET_CC1_SPEC \ -+ " " \ -+ ANDROID_CC1_SPEC("-fPIC")) -+ -+#define CC1PLUS_SPEC \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) - - #undef LINK_SPEC - #define LINK_SPEC \ -diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h -index a1f98d3..3725799 100644 ---- a/gcc/config/i386/pmm_malloc.h -+++ b/gcc/config/i386/pmm_malloc.h -@@ -31,7 +31,7 @@ - #ifndef __cplusplus - extern int posix_memalign (void **, size_t, size_t); - #else --extern "C" int posix_memalign (void **, size_t, size_t) throw (); -+extern "C" int posix_memalign (void **, size_t, size_t); - #endif - - static __inline void * -diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h -index 301a41c..bebfe7f 100644 ---- a/gcc/config/linux-android.h -+++ b/gcc/config/linux-android.h -@@ -38,15 +38,18 @@ - "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" - - #define ANDROID_LINK_SPEC \ -- "%{shared: -Bsymbolic}" -+ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" - --#define ANDROID_CC1_SPEC \ -+#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ - "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ -- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" -+ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" - - #define ANDROID_CC1PLUS_SPEC \ -- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ -- "%{!frtti:%{!fno-rtti: -fno-rtti}}" -+ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ -+ "%{!frtti:%{!fno-rtti: -frtti}}" -+ -+#define ANDROID_ASM_SPEC \ -+ "--noexecstack" - - #define ANDROID_LIB_SPEC \ - "%{!static: -ldl}" -diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h -new file mode 100644 -index 0000000..32c539c ---- /dev/null -+++ b/gcc/config/mips/android.h -@@ -0,0 +1,49 @@ -+/* Target macros for mips*-*android* targets. -+ Copyright (C) 2014 Free Software Foundation, Inc. -+ -+This file is part of GCC. -+ -+GCC is free software; you can redistribute it and/or modify -+it under the terms of the GNU General Public License as published by -+the Free Software Foundation; either version 3, or (at your option) -+any later version. -+ -+GCC is distributed in the hope that it will be useful, -+but WITHOUT ANY WARRANTY; without even the implied warranty of -+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+GNU General Public License for more details. -+ -+You should have received a copy of the GNU General Public License -+along with GCC; see the file COPYING3. If not see -+. */ -+ -+#undef DRIVER_SELF_SPECS -+#define DRIVER_SELF_SPECS \ -+ /* Make sure a -mips option is present. This helps us to pick \ -+ the right multilib, and also makes the later specs easier \ -+ to write. */ \ -+ MIPS_ISA_LEVEL_SPEC, \ -+ \ -+ /* Infer the default float setting from -march. */ \ -+ MIPS_ARCH_FLOAT_SPEC, \ -+ \ -+ /* Infer the -msynci setting from -march if not explicitly set. */ \ -+ MIPS_ISA_SYNCI_SPEC, \ -+ \ -+ /* If no ABI option is specified, infer one from the ISA level \ -+ or -mgp setting. */ \ -+ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ -+ \ -+ /* If no FP ABI option is specified, infer one from the \ -+ ABI/ISA level unless there is a conflicting option. 
*/ \ -+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ -+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ -+ \ -+ /* If no odd-spreg option is specified, infer one from the ISA. */ \ -+ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ -+ \ -+ /* Base SPECs. */ \ -+ BASE_DRIVER_SELF_SPECS, \ -+ \ -+ /* Use the standard linux specs for everything else. */ \ -+ LINUX_DRIVER_SELF_SPECS -diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h -index 15b549c..4a28160 100644 ---- a/gcc/config/mips/gnu-user.h -+++ b/gcc/config/mips/gnu-user.h -@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see - /* The GNU C++ standard library requires this. */ \ - if (c_dialect_cxx ()) \ - builtin_define ("_GNU_SOURCE"); \ -+ ANDROID_TARGET_OS_CPP_BUILTINS(); \ - } while (0) - - #undef SUBTARGET_CPP_SPEC -@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see - - #undef SUBTARGET_ASM_SPEC - #define SUBTARGET_ASM_SPEC \ -- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" -+ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) - - /* The MIPS assembler has different syntax for .set. We set it to - .dummy to trap any errors. */ -@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); - #endif - - #define LINUX_DRIVER_SELF_SPECS \ -- NO_SHARED_SPECS \ -+ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ - MARCH_MTUNE_NATIVE_SPECS, \ - /* -mplt has no effect without -mno-shared. Simplify later \ - specs handling by removing a redundant option. */ \ -diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h -index 8429a7c..4b0825d 100644 ---- a/gcc/config/mips/linux-common.h -+++ b/gcc/config/mips/linux-common.h -@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see - #undef SUBTARGET_CC1_SPEC - #define SUBTARGET_CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) -+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) - - #undef CC1PLUS_SPEC - #define CC1PLUS_SPEC \ -@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see - - /* The default value isn't sufficient in 64-bit mode. */ - #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) -+ -+#ifdef IN_LIBGCC2 -+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) -+#endif -diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android -new file mode 100644 -index 0000000..39f512c ---- /dev/null -+++ b/gcc/config/mips/t-linux-android -@@ -0,0 +1,3 @@ -+MULTILIB_OPTIONS = mips32r2/mips32r6 -+MULTILIB_DIRNAMES = mips-r2 mips-r6 -+MULTILIB_OSDIRNAMES = ../libr2 ../libr6 -diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 -new file mode 100644 -index 0000000..55cab7d ---- /dev/null -+++ b/gcc/config/mips/t-linux-android64 -@@ -0,0 +1,4 @@ -+MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 -+MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 -+MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 -+MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 -diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h -index 37ecfc4..a5f1b99 100644 ---- a/gcc/config/openbsd.h -+++ b/gcc/config/openbsd.h -@@ -136,8 +136,12 @@ while (0) - #define LIB_SPEC OBSD_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #undef LIB_SPEC - #define LIB_SPEC OBSD_LIB_SPEC -diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h -index cbf9097..eb2217f 100644 ---- a/gcc/config/rs6000/sysv4.h -+++ b/gcc/config/rs6000/sysv4.h -@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) - -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" - - #if defined(HAVE_LD_EH_FRAME_HDR) --# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+# ifdef USE_EH_FRAME_HDR_FOR_STATIC -+# define LINK_EH_SPEC "--eh-frame-hdr " -+# else -+# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+# endif - #endif - - #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ -diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h -index 5160e1f..7632a50 100644 ---- a/gcc/config/sol2.h -+++ b/gcc/config/sol2.h -@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see - /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs - --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ - #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+#endif - #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ - #endif - -diff --git a/gcc/configure b/gcc/configure -index 1c6e340..28ad050 100755 ---- a/gcc/configure -+++ b/gcc/configure -@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 - enable_fix_cortex_a53_843419 - with_glibc_version - enable_gnu_unique_object -+enable_eh_frame_hdr_for_static - enable_linker_build_id - enable_default_ssp - with_long_double_128 -@@ -1670,6 +1671,9 @@ Optional Features: - --enable-gnu-unique-object - enable the use of the @gnu_unique_object ELF - extension on glibc systems -+ --enable-eh-frame-hdr-for-static -+ enable linker PT_GNU_EH_FRAME support for static -+ executable - --enable-linker-build-id - compiler will always pass --build-id to linker - --enable-default-ssp enable Stack Smashing Protection as default -@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then - - $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h - -+ # Check whether --enable-eh-frame-hdr-for-static was given. -+if test "${enable_eh_frame_hdr_for_static+set}" = set; then : -+ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in -+ yes | no) ;; -+ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid -+value for --enable-eh-frame-hdr-for-static. -+Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; -+ esac -+else -+ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from -+# Linux kernel. -+ if test x$host = x$build -a x$host = x$target && -+ ldd --version 2>&1 >/dev/null && -+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then -+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` -+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` -+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` -+ if test "$glibcnum" -ge 2003 ; then -+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` -+ if echo "$auvx" | grep AT_PHDR > /dev/null && -+ echo "$auvx" | grep AT_PHNUM > /dev/null; then -+ enable_eh_frame_hdr_for_static=yes -+ fi -+ fi -+ fi -+fi -+ -+ if test x$enable_eh_frame_hdr_for_static = xyes; then -+ -+$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h -+ -+ fi - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 - $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } -diff --git a/gcc/configure.ac b/gcc/configure.ac -index 6c1dcd9..0cf7419 100644 ---- a/gcc/configure.ac -+++ b/gcc/configure.ac -@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) - if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then - AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, - [Define if your linker supports .eh_frame_hdr.]) -+ AC_ARG_ENABLE(eh-frame-hdr-for-static, -+ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], -+ [enable linker PT_GNU_EH_FRAME support for static executable])], -+ [case $enable_eh_frame_hdr_for_static in -+ yes | no) ;; -+ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid -+value for --enable-eh-frame-hdr-for-static. -+Valid choices are 'yes' and 'no'.]) ;; -+ esac], -+# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from -+# Linux kernel. 
-+ [[if test x$host = x$build -a x$host = x$target && -+ ldd --version 2>&1 >/dev/null && -+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then -+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` -+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` -+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` -+ if test "$glibcnum" -ge 2003 ; then -+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` -+ if echo "$auvx" | grep AT_PHDR > /dev/null && -+ echo "$auvx" | grep AT_PHNUM > /dev/null; then -+ enable_eh_frame_hdr_for_static=yes -+ fi -+ fi -+ fi]]) -+ if test x$enable_eh_frame_hdr_for_static = xyes; then -+ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, -+[Define if your system supports PT_GNU_EH_FRAME for static executable.]) -+ fi - fi - AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) - -diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C -new file mode 100644 -index 0000000..15408ef ---- /dev/null -+++ b/gcc/testsuite/g++.dg/eh/spec3-static.C -@@ -0,0 +1,25 @@ -+// PR c++/4381 -+// Test that exception-specs work properly for classes with virtual bases. -+ -+// { dg-do run } -+// { dg-options "-static" } -+ -+class Base {}; -+ -+struct A : virtual public Base -+{ -+ A() {} -+}; -+ -+struct B {}; -+ -+void func() throw (B,A) -+{ -+ throw A(); -+} -+ -+int main(void) -+{ -+ try { func(); } -+ catch (A& a) { } -+} -diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c -index f3343fc..d426477 100644 ---- a/libgcc/crtstuff.c -+++ b/libgcc/crtstuff.c -@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) - #include - # define USE_PT_GNU_EH_FRAME -@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(__sun__) && defined(__svr4__) - #include - # define USE_PT_GNU_EH_FRAME -@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(__GLIBC__) && __GLIBC__ >= 2 - #include - /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. -@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(CRTSTUFFT_O) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(inhibit_libc) \ - && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) - /* On systems using glibc, an inhibit_libc build of libgcc is only -diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h -index 555c0fe..47c8655 100644 ---- a/libgcc/gthr-posix.h -+++ b/libgcc/gthr-posix.h -@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see - #define __GTHREADS 1 - #define __GTHREADS_CXX0X 1 - -+/* The following should normally be in a different header file, -+ * but I couldn't find the right location. The point of the macro -+ * definition below is to prevent libsupc++ and libstdc++ to reference -+ * weak symbols in their static C++ constructors. Such code crashes -+ * when a shared object linked statically to these libraries is -+ * loaded on Android 2.1 (Eclair) and older platform releases, due -+ * to a dynamic linker bug. -+ */ -+#ifdef __ANDROID__ -+#undef GTHREAD_USE_WEAK -+#define GTHREAD_USE_WEAK 0 -+#endif -+ - #include - - #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ -diff --git a/libgcc/gthr.h b/libgcc/gthr.h -index 47a7d06..67a680f 100644 ---- a/libgcc/gthr.h -+++ b/libgcc/gthr.h -@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - #define GTHREAD_USE_WEAK 1 - #endif - #endif -+#if __ANDROID__ -+#include "gthr-posix.h" -+#else - #include "gthr-default.h" -+#endif - - #ifndef HIDE_EXPORTS - #pragma GCC visibility pop -diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure -index 41797a9..5a631dd 100755 ---- a/libstdc++-v3/configure -+++ b/libstdc++-v3/configure -@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext - /* end confdefs.h. */ - #include - int lk; -+#if !defined(SYS_gettid) -+#define SYS_gettid __NR_gettid -+#endif -+#if !defined(SYS_futex) -+#define SYS_futex __NR_futex -+#endif - int - main () - { -@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext - /* end confdefs.h. */ - #include - int lk; -+#if !defined(SYS_gettid) -+#define SYS_gettid __NR_gettid -+#endif -+#if !defined(SYS_futex) -+#define SYS_futex __NR_futex -+#endif - int - main () - { -diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h -index e3e206b..e85dc2c 100644 ---- a/libstdc++-v3/include/bits/locale_facets.h -+++ b/libstdc++-v3/include/bits/locale_facets.h -@@ -47,6 +47,20 @@ - #include - #include - -+#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ -+// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and -+// ctype::_M_widen_init() methods working wrong if optimization enabled. -+// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) -+// are completely messed up and don't correspond to passed values. In case if -+// we disable optimization for those methods, things become correct so we apply -+// this workaround here for a time. -+// TODO: figure out what exactly wrong here - is it bug in GCC optimization -+// algorithm or smth else? -+#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) -+#else -+#define __CRYSTAX_X86_DONT_OPTIMIZE -+#endif -+ - namespace std _GLIBCXX_VISIBILITY(default) - { - _GLIBCXX_BEGIN_NAMESPACE_VERSION -@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION - * @return @a __hi. 
- */ - virtual const char* -- do_widen(const char* __lo, const char* __hi, char_type* __to) const -+ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE - { - __builtin_memcpy(__to, __lo, __hi - __lo); - return __hi; -@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION - - private: - void _M_narrow_init() const; -- void _M_widen_init() const; -+ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; - }; - - #ifdef _GLIBCXX_USE_WCHAR_T -diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc -index 9b61799..c149169 100644 ---- a/libstdc++-v3/libsupc++/guard.cc -+++ b/libstdc++-v3/libsupc++/guard.cc -@@ -33,7 +33,12 @@ - #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ - && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) - # include -+#if defined(__ANDROID__) -+# include -+# define SYS_futex __NR_futex -+#else - # include -+#endif - # include - # define _GLIBCXX_USE_FUTEX - # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/6.3.0/970-crystax.patch b/patches/gcc/6.3.0/970-crystax.patch new file mode 100644 index 0000000..e3109cc --- /dev/null +++ b/patches/gcc/6.3.0/970-crystax.patch @@ -0,0 +1,551 @@ +commit 080803512c8f6f87c2f1f711170d54033144d628 +Author: Dmitry Moskalchuk +Date: Wed Jul 29 11:28:29 2015 +0300 + + [android] Apply Android-related modifications + + Signed-off-by: Dmitry Moskalchuk + +[Edited: keep libstdc++, drop libcrystax-related modifications] +diff --git a/gcc/config.gcc b/gcc/config.gcc +index f66e48cd1..1c253496b 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) + TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` + ;; + aarch64*-*-linux*) +- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" ++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ extra_options="${extra_options} linux-android.opt" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + case $target in + aarch64_be-*) + tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" + ;; ++ aarch64*-*-linux-android*) ++ tm_file="${tm_file} aarch64/aarch64-linux-android.h" ++ ;; + esac + aarch64_multilibs="${with_multilib_list}" + if test "$aarch64_multilibs" = "default"; then +@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian. + tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" + extra_options="${extra_options} linux-android.opt" + case ${target} in ++ mips64*android*) ++ default_mips_arch=mips64r6 ++ default_mips_abi=64 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="${tmake_file} mips/t-linux-android64" ++ ;; ++ mips*android*) ++ default_mips_arch=mips32 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="$tmake_file mips/t-linux-android" ++ ;; + mipsisa32r6*) + default_mips_arch=mips32r6 + ;; +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +new file mode 100644 +index 000000000..db1288fd0 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -0,0 +1,59 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++ This file is part of GCC. 
++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_LINUX_ANDROID_H ++#define GCC_AARCH64_LINUX_ANDROID_H ++ ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ ++ } \ ++ while (0) ++ ++#undef LINK_SPEC ++#define LINK_SPEC \ ++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ ++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) ++ ++#undef CC1_SPEC ++#define CC1_SPEC \ ++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) ++ ++#undef LIB_SPEC ++#define LIB_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ ++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) ++ ++#undef STARTFILE_SPEC ++#define STARTFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) ++ ++#undef ENDFILE_SPEC ++#define ENDFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) ++ ++#endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5fcaa59a3..6864195ee 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -21,7 +21,14 @@ + #ifndef GCC_AARCH64_LINUX_H + #define GCC_AARCH64_LINUX_H + +-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifndef RUNTIME_ROOT_PREFIX ++#define RUNTIME_ROOT_PREFIX "" ++#endif ++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifdef BIONIC_DYNAMIC_LINKER ++#undef BIONIC_DYNAMIC_LINKER ++#endif ++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" + + #undef MUSL_DYNAMIC_LINKER + #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" +diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h +index ad123dde9..97b059de6 100644 +--- a/gcc/config/arm/arm.h ++++ b/gcc/config/arm/arm.h +@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes + + #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ ++ && !inline_thumb1_jump_table \ + && (optimize_size || flag_pic))) + + #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ +- (TARGET_THUMB1 \ ++ (TARGET_THUMB1 && !inline_thumb1_jump_table \ + ? (min >= 0 && max < 512 \ + ? 
(ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 47171b996..eb22d1181 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -8179,7 +8179,7 @@ + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label +- "TARGET_32BIT || optimize_size || flag_pic" ++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" + " + { + enum insn_code code; +diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt +index 0ebe01743..772453889 100644 +--- a/gcc/config/arm/arm.opt ++++ b/gcc/config/arm/arm.opt +@@ -193,6 +193,10 @@ mthumb-interwork + Target Report Mask(INTERWORK) + Support calls between Thumb and ARM instruction sets. + ++minline-thumb1-jumptable ++Target Report Var(inline_thumb1_jump_table) ++Inline Thumb1 Jump table code ++ + mtls-dialect= + Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) + Specify thread local storage scheme. +diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h +index 77f30554d..32158ed65 100644 +--- a/gcc/config/arm/elf.h ++++ b/gcc/config/arm/elf.h +@@ -56,8 +56,7 @@ + #undef SUBSUBTARGET_EXTRA_SPECS + #define SUBSUBTARGET_EXTRA_SPECS + +-#ifndef ASM_SPEC +-#define ASM_SPEC "\ ++#define DEFAULT_ASM_SPEC "\ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ + %(asm_cpu_spec) \ +@@ -66,6 +65,9 @@ + %{mthumb-interwork:-mthumb-interwork} \ + %{mfloat-abi=*} %{mfpu=*} \ + %(subtarget_extra_asm_spec)" ++ ++#ifndef ASM_SPEC ++#define ASM_SPEC DEFAULT_ASM_SPEC + #endif + + /* The ARM uses @ are a comment character so we need to redefine +@@ -104,7 +106,8 @@ + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. */ + #define JUMP_TABLES_IN_TEXT_SECTION \ +- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) ++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ ++ && (optimize_size || flag_pic))) + + #ifndef LINK_SPEC + #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h +index ace84816e..8c8fa6553 100644 +--- a/gcc/config/arm/linux-eabi.h ++++ b/gcc/config/arm/linux-eabi.h +@@ -108,11 +108,16 @@ + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ +- ANDROID_CC1_SPEC) ++ ANDROID_CC1_SPEC("-fpic")) + + #define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + ++#undef ASM_SPEC ++#define ASM_SPEC \ ++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ ++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) ++ + #undef LIB_SPEC + #define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ +diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h +index fee33a3ef..22fb2ced9 100644 +--- a/gcc/config/i386/gnu-user.h ++++ b/gcc/config/i386/gnu-user.h +@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see + When the -shared link option is used a final link is not being + done. 
*/ + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ " -mssse3 -fno-short-enums " \ ++ + #undef ASM_SPEC + #define ASM_SPEC \ +- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" ++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + #undef SUBTARGET_EXTRA_SPECS + #define SUBTARGET_EXTRA_SPECS \ +diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h +index 7a02a7eb4..cac4179bc 100644 +--- a/gcc/config/i386/gnu-user64.h ++++ b/gcc/config/i386/gnu-user64.h +@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define SPEC_X32 "mx32" + #endif + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ "%{m32:-mssse3 -fno-short-enums}" \ ++ "%{!m32:-msse4.2 -mpopcnt}" ++ + #undef ASM_SPEC + #define ASM_SPEC "%{" SPEC_32 ":--32} \ + %{" SPEC_64 ":--64} \ +diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h +index 4b9910fa9..3b11ed086 100644 +--- a/gcc/config/i386/linux-common.h ++++ b/gcc/config/i386/linux-common.h +@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see + #undef CC1_SPEC + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC \ ++ ANDROID_TARGET_CC1_SPEC \ ++ " " \ ++ ANDROID_CC1_SPEC("-fPIC")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + + #undef LINK_SPEC + #define LINK_SPEC \ +diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h +index 301a41ccd..9623c88d0 100644 +--- a/gcc/config/linux-android.h ++++ b/gcc/config/linux-android.h +@@ -38,15 +39,18 @@ + "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" + + #define ANDROID_LINK_SPEC \ +- "%{shared: -Bsymbolic}" ++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" + +-#define ANDROID_CC1_SPEC \ ++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ + "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ +- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" ++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" + + #define ANDROID_CC1PLUS_SPEC \ +- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ +- "%{!frtti:%{!fno-rtti: -fno-rtti}}" ++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ ++ "%{!frtti:%{!fno-rtti: -frtti}}" ++ ++#define ANDROID_ASM_SPEC \ ++ "--noexecstack" + + #define ANDROID_LIB_SPEC \ + "%{!static: -ldl}" +diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h +new file mode 100644 +index 000000000..32c539c8d +--- /dev/null ++++ b/gcc/config/mips/android.h +@@ -0,0 +1,49 @@ ++/* Target macros for mips*-*android* targets. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#undef DRIVER_SELF_SPECS ++#define DRIVER_SELF_SPECS \ ++ /* Make sure a -mips option is present. 
This helps us to pick \ ++ the right multilib, and also makes the later specs easier \ ++ to write. */ \ ++ MIPS_ISA_LEVEL_SPEC, \ ++ \ ++ /* Infer the default float setting from -march. */ \ ++ MIPS_ARCH_FLOAT_SPEC, \ ++ \ ++ /* Infer the -msynci setting from -march if not explicitly set. */ \ ++ MIPS_ISA_SYNCI_SPEC, \ ++ \ ++ /* If no ABI option is specified, infer one from the ISA level \ ++ or -mgp setting. */ \ ++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ ++ \ ++ /* If no FP ABI option is specified, infer one from the \ ++ ABI/ISA level unless there is a conflicting option. */ \ ++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ ++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ ++ \ ++ /* If no odd-spreg option is specified, infer one from the ISA. */ \ ++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ ++ \ ++ /* Base SPECs. */ \ ++ BASE_DRIVER_SELF_SPECS, \ ++ \ ++ /* Use the standard linux specs for everything else. */ \ ++ LINUX_DRIVER_SELF_SPECS +diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h +index 15b549c08..4a2816014 100644 +--- a/gcc/config/mips/gnu-user.h ++++ b/gcc/config/mips/gnu-user.h +@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } while (0) + + #undef SUBTARGET_CPP_SPEC +@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see + + #undef SUBTARGET_ASM_SPEC + #define SUBTARGET_ASM_SPEC \ +- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" ++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + /* The MIPS assembler has different syntax for .set. We set it to + .dummy to trap any errors. */ +@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + #endif + + #define LINUX_DRIVER_SELF_SPECS \ +- NO_SHARED_SPECS \ ++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ + MARCH_MTUNE_NATIVE_SPECS, \ + /* -mplt has no effect without -mno-shared. Simplify later \ + specs handling by removing a redundant option. */ \ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8429a7ca2..8bfacf994 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. 
If not see + #undef SUBTARGET_CC1_SPEC + #define SUBTARGET_CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) + + #undef CC1PLUS_SPEC + #define CC1PLUS_SPEC \ +diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android +new file mode 100644 +index 000000000..39f512c81 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android +@@ -0,0 +1,3 @@ ++MULTILIB_OPTIONS = mips32r2/mips32r6 ++MULTILIB_DIRNAMES = mips-r2 mips-r6 ++MULTILIB_OSDIRNAMES = ../libr2 ../libr6 +diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 +new file mode 100644 +index 000000000..55cab7d62 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android64 +@@ -0,0 +1,4 @@ ++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 ++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 ++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 ++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 +diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h +index 555c0fe24..47c8655f9 100644 +--- a/libgcc/gthr-posix.h ++++ b/libgcc/gthr-posix.h +@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define __GTHREADS 1 + #define __GTHREADS_CXX0X 1 + ++/* The following should normally be in a different header file, ++ * but I couldn't find the right location. The point of the macro ++ * definition below is to prevent libsupc++ and libstdc++ to reference ++ * weak symbols in their static C++ constructors. Such code crashes ++ * when a shared object linked statically to these libraries is ++ * loaded on Android 2.1 (Eclair) and older platform releases, due ++ * to a dynamic linker bug. ++ */ ++#ifdef __ANDROID__ ++#undef GTHREAD_USE_WEAK ++#define GTHREAD_USE_WEAK 0 ++#endif ++ + #include + + #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ +diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure +index 41797a971..f746e8353 100755 +--- a/libstdc++-v3/configure ++++ b/libstdc++-v3/configure +@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h +index e3e206b7d..e85dc2c76 100644 +--- a/libstdc++-v3/include/bits/locale_facets.h ++++ b/libstdc++-v3/include/bits/locale_facets.h +@@ -47,6 +47,20 @@ + #include + #include + ++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ ++// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and ++// ctype::_M_widen_init() methods working wrong if optimization enabled. ++// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) ++// are completely messed up and don't correspond to passed values. In case if ++// we disable optimization for those methods, things become correct so we apply ++// this workaround here for a time. 
++// TODO: figure out what exactly wrong here - is it bug in GCC optimization ++// algorithm or smth else? ++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) ++#else ++#define __CRYSTAX_X86_DONT_OPTIMIZE ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + * @return @a __hi. + */ + virtual const char* +- do_widen(const char* __lo, const char* __hi, char_type* __to) const ++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE + { + __builtin_memcpy(__to, __lo, __hi - __lo); + return __hi; +@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + + private: + void _M_narrow_init() const; +- void _M_widen_init() const; ++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; + }; + + #ifdef _GLIBCXX_USE_WCHAR_T +diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc +index 9b617998f..c149169bb 100644 +--- a/libstdc++-v3/libsupc++/guard.cc ++++ b/libstdc++-v3/libsupc++/guard.cc +@@ -33,7 +33,12 @@ + #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ + && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) + # include ++#if defined(__ANDROID__) ++# include ++# define SYS_futex __NR_futex ++#else + # include ++#endif + # include + # define _GLIBCXX_USE_FUTEX + # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/6.3.0/971-crystax.patch b/patches/gcc/6.3.0/971-crystax.patch new file mode 100644 index 0000000..748a381 --- /dev/null +++ b/patches/gcc/6.3.0/971-crystax.patch @@ -0,0 +1,25 @@ +commit 9f057b62caafe08c968103d39b5df82486a175c2 +Author: Dmitry Moskalchuk +Date: Thu Aug 13 16:11:54 2015 +0300 + + [android] Add additional multilib option: mfloat-abi=hard + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi +index 8f1307c55..cbbec5bd2 100644 +--- a/gcc/config/arm/t-linux-androideabi ++++ b/gcc/config/arm/t-linux-androideabi +@@ -1,8 +1,9 @@ +-MULTILIB_OPTIONS = march=armv7-a mthumb +-MULTILIB_DIRNAMES = armv7-a thumb +-MULTILIB_EXCEPTIONS = ++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard ++MULTILIB_DIRNAMES = armv7-a thumb hard ++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* + MULTILIB_MATCHES = + MULTILIB_OSDIRNAMES = ++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch + + # The "special" multilib can be used to build native applications for Android, + # as opposed to native shared libraries that are then called via JNI. diff --git a/patches/gcc/6.3.0/972-crystax.patch b/patches/gcc/6.3.0/972-crystax.patch new file mode 100644 index 0000000..b9077be --- /dev/null +++ b/patches/gcc/6.3.0/972-crystax.patch @@ -0,0 +1,302 @@ +commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd +Author: H.J. Lu +Date: Mon Apr 14 15:59:47 2014 -0700 + + [android] Always enable --eh-frame-hdr for static executable + + See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and + 23e3137ee2897464b051599b85a09f130d3ad05d + + Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config.in b/gcc/config.in +index 115cb6163..933916833 100644 +--- a/gcc/config.in ++++ b/gcc/config.in +@@ -2119,6 +2119,12 @@ + #endif + + ++/* Define if your system supports PT_GNU_EH_FRAME for static executable. 
*/ ++#ifndef USED_FOR_TARGET ++#undef USE_EH_FRAME_HDR_FOR_STATIC ++#endif ++ ++ + /* Define to 1 if the 'long long' type is wider than 'long' but still + efficiently supported by the host hardware. */ + #ifndef USED_FOR_TARGET +diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h +index 093c38bba..54b3e0c91 100644 +--- a/gcc/config/alpha/elf.h ++++ b/gcc/config/alpha/elf.h +@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; + I imagine that other systems will catch up. In the meantime, it + doesn't harm to make sure that the data exists to be used later. */ + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif +diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h +index 5ded869d2..5f51ac81d 100644 +--- a/gcc/config/freebsd.h ++++ b/gcc/config/freebsd.h +@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see + #define LIB_SPEC FBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #ifdef TARGET_LIBC_PROVIDES_SSP + #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ +diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h +index b0bf40a95..d1874bc29 100644 +--- a/gcc/config/gnu-user.h ++++ b/gcc/config/gnu-user.h +@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LINK_GCC_C_SEQUENCE_SPEC + #define LINK_GCC_C_SEQUENCE_SPEC \ +diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h +index 37ecfc43f..a5f1b9955 100644 +--- a/gcc/config/openbsd.h ++++ b/gcc/config/openbsd.h +@@ -136,8 +136,12 @@ while (0) + #define LIB_SPEC OBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LIB_SPEC + #define LIB_SPEC OBSD_LIB_SPEC +diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h +index cbf909722..eb2217fad 100644 +--- a/gcc/config/rs6000/sysv4.h ++++ b/gcc/config/rs6000/sysv4.h +@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) + -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + #if defined(HAVE_LD_EH_FRAME_HDR) +-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# ifdef USE_EH_FRAME_HDR_FOR_STATIC ++# define LINK_EH_SPEC "--eh-frame-hdr " ++# else ++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# endif + #endif + + #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ +diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h +index 5160e1fda..7632a5081 100644 +--- a/gcc/config/sol2.h ++++ b/gcc/config/sol2.h +@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see + /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs + --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++#endif + #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ + #endif + +diff --git a/gcc/configure b/gcc/configure +index 1c6e3407c..28ad05004 100755 +--- a/gcc/configure ++++ b/gcc/configure +@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 + enable_fix_cortex_a53_843419 + with_glibc_version + enable_gnu_unique_object ++enable_eh_frame_hdr_for_static + enable_linker_build_id + enable_default_ssp + with_long_double_128 +@@ -1670,6 +1671,9 @@ Optional Features: + --enable-gnu-unique-object + enable the use of the @gnu_unique_object ELF + extension on glibc systems ++ --enable-eh-frame-hdr-for-static ++ enable linker PT_GNU_EH_FRAME support for static ++ executable + --enable-linker-build-id + compiler will always pass --build-id to linker + --enable-default-ssp enable Stack Smashing Protection as default +@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + + $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h + ++ # Check whether --enable-eh-frame-hdr-for-static was given. ++if test "${enable_eh_frame_hdr_for_static+set}" = set; then : ++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; ++ esac ++else ++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. ++ if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi ++fi ++ ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ ++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h ++ ++ fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 + $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } +diff --git a/gcc/configure.ac b/gcc/configure.ac +index 6c1dcd9ae..0cf7419e7 100644 +--- a/gcc/configure.ac ++++ b/gcc/configure.ac +@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) + if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, + [Define if your linker supports .eh_frame_hdr.]) ++ AC_ARG_ENABLE(eh-frame-hdr-for-static, ++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], ++ [enable linker PT_GNU_EH_FRAME support for static executable])], ++ [case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'.]) ;; ++ esac], ++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. 
++ [[if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi]]) ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, ++[Define if your system supports PT_GNU_EH_FRAME for static executable.]) ++ fi + fi + AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) + +diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C +new file mode 100644 +index 000000000..15408effa +--- /dev/null ++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C +@@ -0,0 +1,25 @@ ++// PR c++/4381 ++// Test that exception-specs work properly for classes with virtual bases. ++ ++// { dg-do run } ++// { dg-options "-static" } ++ ++class Base {}; ++ ++struct A : virtual public Base ++{ ++ A() {} ++}; ++ ++struct B {}; ++ ++void func() throw (B,A) ++{ ++ throw A(); ++} ++ ++int main(void) ++{ ++ try { func(); } ++ catch (A& a) { } ++} +diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c +index f3343fc4f..d42647779 100644 +--- a/libgcc/crtstuff.c ++++ b/libgcc/crtstuff.c +@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) + #include + # define USE_PT_GNU_EH_FRAME +@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__sun__) && defined(__svr4__) + #include + # define USE_PT_GNU_EH_FRAME +@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__GLIBC__) && __GLIBC__ >= 2 + #include + /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. 
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(CRTSTUFFT_O) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(inhibit_libc) \ + && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) + /* On systems using glibc, an inhibit_libc build of libgcc is only diff --git a/patches/gcc/6.3.0/973-crystax.patch b/patches/gcc/6.3.0/973-crystax.patch new file mode 100644 index 0000000..b96ece3 --- /dev/null +++ b/patches/gcc/6.3.0/973-crystax.patch @@ -0,0 +1,20 @@ +commit 778a9ef107f51544d583f110e92b75f4d9d79117 +Author: Dmitry Moskalchuk +Date: Thu Aug 20 19:11:07 2015 +0300 + + [android] Don't use PIE copyrelocs for x86/x86_64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 3d044e8bd..5c89fcab0 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC ++ && !TARGET_HAS_BIONIC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) diff --git a/patches/gcc/6.3.0/974-crystax.patch b/patches/gcc/6.3.0/974-crystax.patch new file mode 100644 index 0000000..9db4f54 --- /dev/null +++ b/patches/gcc/6.3.0/974-crystax.patch @@ -0,0 +1,24 @@ +commit dbeae1190cabad83999f2540523f045acc1bb4ec +Author: Dmitry Moskalchuk +Date: Fri Aug 21 17:41:59 2015 +0300 + + [android] Always use gthr-posix.h instead of gthr-default.h + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/libgcc/gthr.h b/libgcc/gthr.h +index 47a7d061a..67a680f90 100644 +--- a/libgcc/gthr.h ++++ b/libgcc/gthr.h +@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define GTHREAD_USE_WEAK 1 + #endif + #endif ++#if __ANDROID__ ++#include "gthr-posix.h" ++#else + #include "gthr-default.h" ++#endif + + #ifndef HIDE_EXPORTS + #pragma GCC visibility pop diff --git a/patches/gcc/6.3.0/975-crystax.patch b/patches/gcc/6.3.0/975-crystax.patch new file mode 100644 index 0000000..9efc2a4 --- /dev/null +++ b/patches/gcc/6.3.0/975-crystax.patch @@ -0,0 +1,31 @@ +commit 8a66d422721ae5999737d7825701ff22097d287b +Author: Andrew Hsieh +Date: Mon Apr 14 21:05:51 2014 -0700 + + [android] Fix ARM generates insufficient alignment for NEON vst/vld + + See d909af3e2469aad87d5c3e79b93c778fd26c03a9 + + Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484 + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index 5974c65d3..71b2c7aa9 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) + memsize = MEM_SIZE (x); + + /* Only certain alignment specifiers are supported by the hardware. */ +- if (memsize == 32 && (align % 32) == 0) ++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC ++ honors stricter alignment of composite type in user code, it doesn't ++ observe the alignment of memory passed as an extra argument for function ++ returning large composite type. 
See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ ++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) + align_bits = 256; +- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) ++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) + align_bits = 128; + else if (memsize >= 8 && (align % 8) == 0) + align_bits = 64; diff --git a/patches/gcc/6.3.0/976-crystax.patch b/patches/gcc/6.3.0/976-crystax.patch new file mode 100644 index 0000000..790d4a9 --- /dev/null +++ b/patches/gcc/6.3.0/976-crystax.patch @@ -0,0 +1,21 @@ +commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234 +Author: Dmitry Moskalchuk +Date: Sat Aug 22 09:55:55 2015 +0300 + + [android][i386] Remove throw() declaration from posix_memalign() proto + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h +index a1f98d3d1..3725799be 100644 +--- a/gcc/config/i386/pmm_malloc.h ++++ b/gcc/config/i386/pmm_malloc.h +@@ -31,7 +31,7 @@ + #ifndef __cplusplus + extern int posix_memalign (void **, size_t, size_t); + #else +-extern "C" int posix_memalign (void **, size_t, size_t) throw (); ++extern "C" int posix_memalign (void **, size_t, size_t); + #endif + + static __inline void * diff --git a/patches/gcc/6.3.0/977-crystax.patch b/patches/gcc/6.3.0/977-crystax.patch new file mode 100644 index 0000000..0211d72 --- /dev/null +++ b/patches/gcc/6.3.0/977-crystax.patch @@ -0,0 +1,33 @@ +commit 9ae82f7cfc1073820092dd9f957559667e77db0d +Author: Dmitry Moskalchuk +Date: Tue Aug 25 09:36:42 2015 +0300 + + [android] Explicitly make _Unwind_Resume visible for arm64/mips64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +index db1288fd0..38bc64d61 100644 +--- a/gcc/config/aarch64/aarch64-linux-android.h ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -57,4 +57,8 @@ + #define ENDFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) + ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif ++ + #endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8bfacf994..262a9a341 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see + + /* The default value isn't sufficient in 64-bit mode. */ + #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif diff --git a/patches/gcc/7.1.0/951-bionic-ndk.patch b/patches/gcc/7.1.0/951-bionic-ndk.patch new file mode 100644 index 0000000..59c50a8 --- /dev/null +++ b/patches/gcc/7.1.0/951-bionic-ndk.patch @@ -0,0 +1,58 @@ +commit d38d37bdfe24b7ce1bdcb55642fb6b904718e68f +Author: Howard Chu +Date: Tue Apr 25 19:02:18 2017 -0700 + + Fix ctype for newer NDK headers + +diff --git a/libstdc++-v3/config/os/bionic/ctype_base.h b/libstdc++-v3/config/os/bionic/ctype_base.h +index 33978f3..c36e63c 100644 +--- a/libstdc++-v3/config/os/bionic/ctype_base.h ++++ b/libstdc++-v3/config/os/bionic/ctype_base.h +@@ -28,6 +28,18 @@ + + // Information as gleaned from /usr/include/ctype.h + ++// _CTYPE prefix was added in NDK r14 unified headers ++#ifndef _CTYPE_U ++#define _CTYPE_U _U ++#define _CTYPE_L _L ++#define _CTYPE_D _N ++#define _CTYPE_S _S ++#define _CTYPE_P _P ++#define _CTYPE_C _C ++#define _CTYPE_X _X ++#define _CTYPE_B _B ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -41,17 +53,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + // NB: Offsets into ctype::_M_table force a particular size + // on the mask type. Because of this, we don't use an enum. + typedef char mask; +- static const mask upper = _U; +- static const mask lower = _L; +- static const mask alpha = _U | _L; +- static const mask digit = _N; +- static const mask xdigit = _X | _N; +- static const mask space = _S; +- static const mask print = _P | _U | _L | _N | _B; +- static const mask graph = _P | _U | _L | _N; +- static const mask cntrl = _C; +- static const mask punct = _P; +- static const mask alnum = _U | _L | _N; ++ static const mask upper = _CTYPE_U; ++ static const mask lower = _CTYPE_L; ++ static const mask alpha = _CTYPE_U | _CTYPE_L; ++ static const mask digit = _CTYPE_D; ++ static const mask xdigit = _CTYPE_X | _CTYPE_D; ++ static const mask space = _CTYPE_S; ++ static const mask print = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D | _CTYPE_B; ++ static const mask graph = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D; ++ static const mask cntrl = _CTYPE_C; ++ static const mask punct = _CTYPE_P; ++ static const mask alnum = _CTYPE_U | _CTYPE_L | _CTYPE_D; + #if __cplusplus >= 201103L + static const mask blank = space; + #endif diff --git a/patches/gcc/7.1.0/952-bionic-errno.patch b/patches/gcc/7.1.0/952-bionic-errno.patch new file mode 100644 index 0000000..91f6ca3 --- /dev/null +++ b/patches/gcc/7.1.0/952-bionic-errno.patch @@ -0,0 +1,19 @@ +commit 6cd4ad106ef87a3c58b0c3478e78409b47000de0 +Author: Howard Chu +Date: Tue Apr 25 20:17:03 2017 -0700 + + Fix, errno is volatile int + +diff --git a/libstdc++-v3/src/filesystem/dir.cc b/libstdc++-v3/src/filesystem/dir.cc +index 6ff12d0..5bbd664 100644 +--- a/libstdc++-v3/src/filesystem/dir.cc ++++ b/libstdc++-v3/src/filesystem/dir.cc +@@ -147,7 +147,7 @@ fs::_Dir::advance(error_code* ec, directory_options options) + + int err = std::exchange(errno, 0); + const auto entp = readdir(dirp); +- std::swap(errno, err); ++ std::swap((int&)errno, err); + + if (entp) + { diff --git a/patches/gcc/7.1.0/970-crystax.patch b/patches/gcc/7.1.0/970-crystax.patch new file mode 100644 index 0000000..7324d7f --- /dev/null +++ b/patches/gcc/7.1.0/970-crystax.patch @@ -0,0 +1,553 @@ +commit 080803512c8f6f87c2f1f711170d54033144d628 +Author: Dmitry Moskalchuk +Date: Wed Jul 29 11:28:29 2015 +0300 + + [android] Apply Android-related modifications + + 
Signed-off-by: Dmitry Moskalchuk + +[Edited: keep libstdc++, drop libcrystax-related modifications] +diff --git a/gcc/config.gcc b/gcc/config.gcc +index f66e48cd1..1c253496b 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) + TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` + ;; + aarch64*-*-linux*) +- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" ++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ extra_options="${extra_options} linux-android.opt" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + case $target in + aarch64_be-*) + tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" + ;; ++ aarch64*-*-linux-android*) ++ tm_file="${tm_file} aarch64/aarch64-linux-android.h" ++ ;; + esac + aarch64_multilibs="${with_multilib_list}" + if test "$aarch64_multilibs" = "default"; then +@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian. + tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" + extra_options="${extra_options} linux-android.opt" + case ${target} in ++ mips64*android*) ++ default_mips_arch=mips64r6 ++ default_mips_abi=64 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="${tmake_file} mips/t-linux-android64" ++ ;; ++ mips*android*) ++ default_mips_arch=mips32 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="$tmake_file mips/t-linux-android" ++ ;; + mipsisa32r6*) + default_mips_arch=mips32r6 + ;; +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +new file mode 100644 +index 000000000..db1288fd0 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -0,0 +1,59 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef GCC_AARCH64_LINUX_ANDROID_H ++#define GCC_AARCH64_LINUX_ANDROID_H ++ ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ ++ } \ ++ while (0) ++ ++#undef LINK_SPEC ++#define LINK_SPEC \ ++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ ++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) ++ ++#undef CC1_SPEC ++#define CC1_SPEC \ ++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) ++ ++#undef LIB_SPEC ++#define LIB_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ ++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) ++ ++#undef STARTFILE_SPEC ++#define STARTFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) ++ ++#undef ENDFILE_SPEC ++#define ENDFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) ++ ++#endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5fcaa59a3..6864195ee 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -21,7 +21,14 @@ + #ifndef GCC_AARCH64_LINUX_H + #define GCC_AARCH64_LINUX_H + +-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifndef RUNTIME_ROOT_PREFIX ++#define RUNTIME_ROOT_PREFIX "" ++#endif ++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifdef BIONIC_DYNAMIC_LINKER ++#undef BIONIC_DYNAMIC_LINKER ++#endif ++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" + + #undef MUSL_DYNAMIC_LINKER + #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" +diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h +index ad123dde9..97b059de6 100644 +--- a/gcc/config/arm/arm.h ++++ b/gcc/config/arm/arm.h +@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes + + #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ ++ && !inline_thumb1_jump_table \ + && (optimize_size || flag_pic))) + + #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ +- (TARGET_THUMB1 \ ++ (TARGET_THUMB1 && !inline_thumb1_jump_table \ + ? (min >= 0 && max < 512 \ + ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 47171b996..eb22d1181 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -8179,7 +8179,7 @@ + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label +- "(TARGET_32BIT || optimize_size || flag_pic) && !target_pure_code" ++ "(TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)) && !target_pure_code" + " + { + enum insn_code code; +diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt +index 0ebe01743..772453889 100644 +--- a/gcc/config/arm/arm.opt ++++ b/gcc/config/arm/arm.opt +@@ -193,6 +193,10 @@ mthumb-interwork + Target Report Mask(INTERWORK) + Support calls between Thumb and ARM instruction sets. 
+ ++minline-thumb1-jumptable ++Target Report Var(inline_thumb1_jump_table) ++Inline Thumb1 Jump table code ++ + mtls-dialect= + Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) + Specify thread local storage scheme. +diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h +index 77f30554d..32158ed65 100644 +--- a/gcc/config/arm/elf.h ++++ b/gcc/config/arm/elf.h +@@ -56,8 +56,7 @@ + #undef SUBSUBTARGET_EXTRA_SPECS + #define SUBSUBTARGET_EXTRA_SPECS + +-#ifndef ASM_SPEC +-#define ASM_SPEC "\ ++#define DEFAULT_ASM_SPEC "\ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ + %(asm_cpu_spec) \ +@@ -66,6 +65,9 @@ + %{mthumb-interwork:-mthumb-interwork} \ + %{mfloat-abi=*} %{mfpu=*} \ + %(subtarget_extra_asm_spec)" ++ ++#ifndef ASM_SPEC ++#define ASM_SPEC DEFAULT_ASM_SPEC + #endif + + /* The ARM uses @ are a comment character so we need to redefine +@@ -104,8 +106,9 @@ + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. */ + #define JUMP_TABLES_IN_TEXT_SECTION \ +- ((TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) \ +- && !target_pure_code) ++ ((TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ ++ && (optimize_size || flag_pic))) \ ++ && !target_pure_code) + + #ifndef LINK_SPEC + #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h +index ace84816e..8c8fa6553 100644 +--- a/gcc/config/arm/linux-eabi.h ++++ b/gcc/config/arm/linux-eabi.h +@@ -108,11 +108,16 @@ + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ +- ANDROID_CC1_SPEC) ++ ANDROID_CC1_SPEC("-fpic")) + + #define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + ++#undef ASM_SPEC ++#define ASM_SPEC \ ++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ ++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) ++ + #undef LIB_SPEC + #define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ +diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h +index fee33a3ef..22fb2ced9 100644 +--- a/gcc/config/i386/gnu-user.h ++++ b/gcc/config/i386/gnu-user.h +@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see + When the -shared link option is used a final link is not being + done. */ + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ " -mssse3 -fno-short-enums " \ ++ + #undef ASM_SPEC + #define ASM_SPEC \ +- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" ++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + #undef SUBTARGET_EXTRA_SPECS + #define SUBTARGET_EXTRA_SPECS \ +diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h +index 7a02a7eb4..cac4179bc 100644 +--- a/gcc/config/i386/gnu-user64.h ++++ b/gcc/config/i386/gnu-user64.h +@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see + #define SPEC_X32 "mx32" + #endif + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ "%{m32:-mssse3 -fno-short-enums}" \ ++ "%{!m32:-msse4.2 -mpopcnt}" ++ + #undef ASM_SPEC + #define ASM_SPEC "%{" SPEC_32 ":--32} \ + %{" SPEC_64 ":--64} \ +diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h +index 4b9910fa9..3b11ed086 100644 +--- a/gcc/config/i386/linux-common.h ++++ b/gcc/config/i386/linux-common.h +@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see + #undef CC1_SPEC + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC \ ++ ANDROID_TARGET_CC1_SPEC \ ++ " " \ ++ ANDROID_CC1_SPEC("-fPIC")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + + #undef LINK_SPEC + #define LINK_SPEC \ +diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h +index 301a41ccd..9623c88d0 100644 +--- a/gcc/config/linux-android.h ++++ b/gcc/config/linux-android.h +@@ -38,15 +39,18 @@ + "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" + + #define ANDROID_LINK_SPEC \ +- "%{shared: -Bsymbolic}" ++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" + +-#define ANDROID_CC1_SPEC \ ++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ + "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ +- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" ++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" + + #define ANDROID_CC1PLUS_SPEC \ +- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ +- "%{!frtti:%{!fno-rtti: -fno-rtti}}" ++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ ++ "%{!frtti:%{!fno-rtti: -frtti}}" ++ ++#define ANDROID_ASM_SPEC \ ++ "--noexecstack" + + #define ANDROID_LIB_SPEC \ + "%{!static: -ldl}" +diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h +new file mode 100644 +index 000000000..32c539c8d +--- /dev/null ++++ b/gcc/config/mips/android.h +@@ -0,0 +1,49 @@ ++/* Target macros for mips*-*android* targets. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#undef DRIVER_SELF_SPECS ++#define DRIVER_SELF_SPECS \ ++ /* Make sure a -mips option is present. This helps us to pick \ ++ the right multilib, and also makes the later specs easier \ ++ to write. */ \ ++ MIPS_ISA_LEVEL_SPEC, \ ++ \ ++ /* Infer the default float setting from -march. */ \ ++ MIPS_ARCH_FLOAT_SPEC, \ ++ \ ++ /* Infer the -msynci setting from -march if not explicitly set. */ \ ++ MIPS_ISA_SYNCI_SPEC, \ ++ \ ++ /* If no ABI option is specified, infer one from the ISA level \ ++ or -mgp setting. */ \ ++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ ++ \ ++ /* If no FP ABI option is specified, infer one from the \ ++ ABI/ISA level unless there is a conflicting option. 
*/ \ ++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ ++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ ++ \ ++ /* If no odd-spreg option is specified, infer one from the ISA. */ \ ++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ ++ \ ++ /* Base SPECs. */ \ ++ BASE_DRIVER_SELF_SPECS, \ ++ \ ++ /* Use the standard linux specs for everything else. */ \ ++ LINUX_DRIVER_SELF_SPECS +diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h +index 15b549c08..4a2816014 100644 +--- a/gcc/config/mips/gnu-user.h ++++ b/gcc/config/mips/gnu-user.h +@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } while (0) + + #undef SUBTARGET_CPP_SPEC +@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see + + #undef SUBTARGET_ASM_SPEC + #define SUBTARGET_ASM_SPEC \ +- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" ++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + /* The MIPS assembler has different syntax for .set. We set it to + .dummy to trap any errors. */ +@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + #endif + + #define LINUX_DRIVER_SELF_SPECS \ +- NO_SHARED_SPECS \ ++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ + MARCH_MTUNE_NATIVE_SPECS, \ + /* -mplt has no effect without -mno-shared. Simplify later \ + specs handling by removing a redundant option. */ \ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8429a7ca2..8bfacf994 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see + #undef SUBTARGET_CC1_SPEC + #define SUBTARGET_CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) + + #undef CC1PLUS_SPEC + #define CC1PLUS_SPEC \ +diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android +new file mode 100644 +index 000000000..39f512c81 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android +@@ -0,0 +1,3 @@ ++MULTILIB_OPTIONS = mips32r2/mips32r6 ++MULTILIB_DIRNAMES = mips-r2 mips-r6 ++MULTILIB_OSDIRNAMES = ../libr2 ../libr6 +diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 +new file mode 100644 +index 000000000..55cab7d62 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android64 +@@ -0,0 +1,4 @@ ++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 ++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 ++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 ++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 +diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h +index 555c0fe24..47c8655f9 100644 +--- a/libgcc/gthr-posix.h ++++ b/libgcc/gthr-posix.h +@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define __GTHREADS 1 + #define __GTHREADS_CXX0X 1 + ++/* The following should normally be in a different header file, ++ * but I couldn't find the right location. The point of the macro ++ * definition below is to prevent libsupc++ and libstdc++ to reference ++ * weak symbols in their static C++ constructors. 
Such code crashes ++ * when a shared object linked statically to these libraries is ++ * loaded on Android 2.1 (Eclair) and older platform releases, due ++ * to a dynamic linker bug. ++ */ ++#ifdef __ANDROID__ ++#undef GTHREAD_USE_WEAK ++#define GTHREAD_USE_WEAK 0 ++#endif ++ + #include + + #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ +diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure +index 41797a971..f746e8353 100755 +--- a/libstdc++-v3/configure ++++ b/libstdc++-v3/configure +@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h +index e3e206b7d..e85dc2c76 100644 +--- a/libstdc++-v3/include/bits/locale_facets.h ++++ b/libstdc++-v3/include/bits/locale_facets.h +@@ -47,6 +47,20 @@ + #include + #include + ++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ ++// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and ++// ctype::_M_widen_init() methods working wrong if optimization enabled. ++// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) ++// are completely messed up and don't correspond to passed values. In case if ++// we disable optimization for those methods, things become correct so we apply ++// this workaround here for a time. ++// TODO: figure out what exactly wrong here - is it bug in GCC optimization ++// algorithm or smth else? ++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) ++#else ++#define __CRYSTAX_X86_DONT_OPTIMIZE ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + * @return @a __hi. 
+ */ + virtual const char* +- do_widen(const char* __lo, const char* __hi, char_type* __to) const ++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE + { + __builtin_memcpy(__to, __lo, __hi - __lo); + return __hi; +@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + + private: + void _M_narrow_init() const; +- void _M_widen_init() const; ++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; + }; + + #ifdef _GLIBCXX_USE_WCHAR_T +diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc +index 9b617998f..c149169bb 100644 +--- a/libstdc++-v3/libsupc++/guard.cc ++++ b/libstdc++-v3/libsupc++/guard.cc +@@ -33,7 +33,12 @@ + #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ + && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) + # include ++#if defined(__ANDROID__) ++# include ++# define SYS_futex __NR_futex ++#else + # include ++#endif + # include + # define _GLIBCXX_USE_FUTEX + # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/7.1.0/971-crystax.patch b/patches/gcc/7.1.0/971-crystax.patch new file mode 100644 index 0000000..748a381 --- /dev/null +++ b/patches/gcc/7.1.0/971-crystax.patch @@ -0,0 +1,25 @@ +commit 9f057b62caafe08c968103d39b5df82486a175c2 +Author: Dmitry Moskalchuk +Date: Thu Aug 13 16:11:54 2015 +0300 + + [android] Add additional multilib option: mfloat-abi=hard + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi +index 8f1307c55..cbbec5bd2 100644 +--- a/gcc/config/arm/t-linux-androideabi ++++ b/gcc/config/arm/t-linux-androideabi +@@ -1,8 +1,9 @@ +-MULTILIB_OPTIONS = march=armv7-a mthumb +-MULTILIB_DIRNAMES = armv7-a thumb +-MULTILIB_EXCEPTIONS = ++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard ++MULTILIB_DIRNAMES = armv7-a thumb hard ++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* + MULTILIB_MATCHES = + MULTILIB_OSDIRNAMES = ++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch + + # The "special" multilib can be used to build native applications for Android, + # as opposed to native shared libraries that are then called via JNI. diff --git a/patches/gcc/7.1.0/972-crystax.patch b/patches/gcc/7.1.0/972-crystax.patch new file mode 100644 index 0000000..b9077be --- /dev/null +++ b/patches/gcc/7.1.0/972-crystax.patch @@ -0,0 +1,302 @@ +commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd +Author: H.J. Lu +Date: Mon Apr 14 15:59:47 2014 -0700 + + [android] Always enable --eh-frame-hdr for static executable + + See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and + 23e3137ee2897464b051599b85a09f130d3ad05d + + Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config.in b/gcc/config.in +index 115cb6163..933916833 100644 +--- a/gcc/config.in ++++ b/gcc/config.in +@@ -2119,6 +2119,12 @@ + #endif + + ++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */ ++#ifndef USED_FOR_TARGET ++#undef USE_EH_FRAME_HDR_FOR_STATIC ++#endif ++ ++ + /* Define to 1 if the 'long long' type is wider than 'long' but still + efficiently supported by the host hardware. */ + #ifndef USED_FOR_TARGET +diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h +index 093c38bba..54b3e0c91 100644 +--- a/gcc/config/alpha/elf.h ++++ b/gcc/config/alpha/elf.h +@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; + I imagine that other systems will catch up. In the meantime, it + doesn't harm to make sure that the data exists to be used later. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif +diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h +index 5ded869d2..5f51ac81d 100644 +--- a/gcc/config/freebsd.h ++++ b/gcc/config/freebsd.h +@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see + #define LIB_SPEC FBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #ifdef TARGET_LIBC_PROVIDES_SSP + #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ +diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h +index b0bf40a95..d1874bc29 100644 +--- a/gcc/config/gnu-user.h ++++ b/gcc/config/gnu-user.h +@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LINK_GCC_C_SEQUENCE_SPEC + #define LINK_GCC_C_SEQUENCE_SPEC \ +diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h +index 37ecfc43f..a5f1b9955 100644 +--- a/gcc/config/openbsd.h ++++ b/gcc/config/openbsd.h +@@ -136,8 +136,12 @@ while (0) + #define LIB_SPEC OBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LIB_SPEC + #define LIB_SPEC OBSD_LIB_SPEC +diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h +index cbf909722..eb2217fad 100644 +--- a/gcc/config/rs6000/sysv4.h ++++ b/gcc/config/rs6000/sysv4.h +@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) + -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + #if defined(HAVE_LD_EH_FRAME_HDR) +-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# ifdef USE_EH_FRAME_HDR_FOR_STATIC ++# define LINK_EH_SPEC "--eh-frame-hdr " ++# else ++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# endif + #endif + + #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ +diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h +index 5160e1fda..7632a5081 100644 +--- a/gcc/config/sol2.h ++++ b/gcc/config/sol2.h +@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see + /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs + --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++#endif + #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ + #endif + +diff --git a/gcc/configure b/gcc/configure +index 1c6e3407c..28ad05004 100755 +--- a/gcc/configure ++++ b/gcc/configure +@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 + enable_fix_cortex_a53_843419 + with_glibc_version + enable_gnu_unique_object ++enable_eh_frame_hdr_for_static + enable_linker_build_id + enable_default_ssp + with_long_double_128 +@@ -1670,6 +1671,9 @@ Optional Features: + --enable-gnu-unique-object + enable the use of the @gnu_unique_object ELF + extension on glibc systems ++ --enable-eh-frame-hdr-for-static ++ enable linker PT_GNU_EH_FRAME support for static ++ executable + --enable-linker-build-id + compiler will always pass --build-id to linker + --enable-default-ssp enable Stack Smashing Protection as default +@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + + $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h + ++ # Check whether --enable-eh-frame-hdr-for-static was given. ++if test "${enable_eh_frame_hdr_for_static+set}" = set; then : ++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; ++ esac ++else ++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. ++ if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi ++fi ++ ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ ++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h ++ ++ fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 + $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } +diff --git a/gcc/configure.ac b/gcc/configure.ac +index 6c1dcd9ae..0cf7419e7 100644 +--- a/gcc/configure.ac ++++ b/gcc/configure.ac +@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) + if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, + [Define if your linker supports .eh_frame_hdr.]) ++ AC_ARG_ENABLE(eh-frame-hdr-for-static, ++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], ++ [enable linker PT_GNU_EH_FRAME support for static executable])], ++ [case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'.]) ;; ++ esac], ++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. 
++ [[if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi]]) ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, ++[Define if your system supports PT_GNU_EH_FRAME for static executable.]) ++ fi + fi + AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) + +diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C +new file mode 100644 +index 000000000..15408effa +--- /dev/null ++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C +@@ -0,0 +1,25 @@ ++// PR c++/4381 ++// Test that exception-specs work properly for classes with virtual bases. ++ ++// { dg-do run } ++// { dg-options "-static" } ++ ++class Base {}; ++ ++struct A : virtual public Base ++{ ++ A() {} ++}; ++ ++struct B {}; ++ ++void func() throw (B,A) ++{ ++ throw A(); ++} ++ ++int main(void) ++{ ++ try { func(); } ++ catch (A& a) { } ++} +diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c +index f3343fc4f..d42647779 100644 +--- a/libgcc/crtstuff.c ++++ b/libgcc/crtstuff.c +@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) + #include + # define USE_PT_GNU_EH_FRAME +@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__sun__) && defined(__svr4__) + #include + # define USE_PT_GNU_EH_FRAME +@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__GLIBC__) && __GLIBC__ >= 2 + #include + /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. 
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(CRTSTUFFT_O) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(inhibit_libc) \ + && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) + /* On systems using glibc, an inhibit_libc build of libgcc is only diff --git a/patches/gcc/7.1.0/973-crystax.patch b/patches/gcc/7.1.0/973-crystax.patch new file mode 100644 index 0000000..b96ece3 --- /dev/null +++ b/patches/gcc/7.1.0/973-crystax.patch @@ -0,0 +1,20 @@ +commit 778a9ef107f51544d583f110e92b75f4d9d79117 +Author: Dmitry Moskalchuk +Date: Thu Aug 20 19:11:07 2015 +0300 + + [android] Don't use PIE copyrelocs for x86/x86_64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 3d044e8bd..5c89fcab0 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC ++ && !TARGET_HAS_BIONIC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) diff --git a/patches/gcc/7.1.0/974-crystax.patch b/patches/gcc/7.1.0/974-crystax.patch new file mode 100644 index 0000000..9db4f54 --- /dev/null +++ b/patches/gcc/7.1.0/974-crystax.patch @@ -0,0 +1,24 @@ +commit dbeae1190cabad83999f2540523f045acc1bb4ec +Author: Dmitry Moskalchuk +Date: Fri Aug 21 17:41:59 2015 +0300 + + [android] Always use gthr-posix.h instead of gthr-default.h + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/libgcc/gthr.h b/libgcc/gthr.h +index 47a7d061a..67a680f90 100644 +--- a/libgcc/gthr.h ++++ b/libgcc/gthr.h +@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define GTHREAD_USE_WEAK 1 + #endif + #endif ++#if __ANDROID__ ++#include "gthr-posix.h" ++#else + #include "gthr-default.h" ++#endif + + #ifndef HIDE_EXPORTS + #pragma GCC visibility pop diff --git a/patches/gcc/7.1.0/975-crystax.patch b/patches/gcc/7.1.0/975-crystax.patch new file mode 100644 index 0000000..9efc2a4 --- /dev/null +++ b/patches/gcc/7.1.0/975-crystax.patch @@ -0,0 +1,31 @@ +commit 8a66d422721ae5999737d7825701ff22097d287b +Author: Andrew Hsieh +Date: Mon Apr 14 21:05:51 2014 -0700 + + [android] Fix ARM generates insufficient alignment for NEON vst/vld + + See d909af3e2469aad87d5c3e79b93c778fd26c03a9 + + Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484 + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index 5974c65d3..71b2c7aa9 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) + memsize = MEM_SIZE (x); + + /* Only certain alignment specifiers are supported by the hardware. */ +- if (memsize == 32 && (align % 32) == 0) ++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC ++ honors stricter alignment of composite type in user code, it doesn't ++ observe the alignment of memory passed as an extra argument for function ++ returning large composite type. 
See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ ++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) + align_bits = 256; +- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) ++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) + align_bits = 128; + else if (memsize >= 8 && (align % 8) == 0) + align_bits = 64; diff --git a/patches/gcc/7.1.0/976-crystax.patch b/patches/gcc/7.1.0/976-crystax.patch new file mode 100644 index 0000000..790d4a9 --- /dev/null +++ b/patches/gcc/7.1.0/976-crystax.patch @@ -0,0 +1,21 @@ +commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234 +Author: Dmitry Moskalchuk +Date: Sat Aug 22 09:55:55 2015 +0300 + + [android][i386] Remove throw() declaration from posix_memalign() proto + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h +index a1f98d3d1..3725799be 100644 +--- a/gcc/config/i386/pmm_malloc.h ++++ b/gcc/config/i386/pmm_malloc.h +@@ -31,7 +31,7 @@ + #ifndef __cplusplus + extern int posix_memalign (void **, size_t, size_t); + #else +-extern "C" int posix_memalign (void **, size_t, size_t) throw (); ++extern "C" int posix_memalign (void **, size_t, size_t); + #endif + + static __inline void * diff --git a/patches/gcc/7.1.0/977-crystax.patch b/patches/gcc/7.1.0/977-crystax.patch new file mode 100644 index 0000000..0211d72 --- /dev/null +++ b/patches/gcc/7.1.0/977-crystax.patch @@ -0,0 +1,33 @@ +commit 9ae82f7cfc1073820092dd9f957559667e77db0d +Author: Dmitry Moskalchuk +Date: Tue Aug 25 09:36:42 2015 +0300 + + [android] Explicitly make _Unwind_Resume visible for arm64/mips64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +index db1288fd0..38bc64d61 100644 +--- a/gcc/config/aarch64/aarch64-linux-android.h ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -57,4 +57,8 @@ + #define ENDFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) + ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif ++ + #endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8bfacf994..262a9a341 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see + + /* The default value isn't sufficient in 64-bit mode. */ + #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif diff --git a/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch b/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch deleted file mode 100644 index 1ae1186..0000000 --- a/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch +++ /dev/null @@ -1,17570 +0,0 @@ -Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master - -Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with -Crystax-specific lib support omitted -diff --git a/gcc/config.gcc b/gcc/config.gcc -index f66e48c..4380657 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -374,6 +374,7 @@ i[34567]86-*-*) - rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h - adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h - avx512cdintrin.h avx512erintrin.h avx512pfintrin.h -+ arm_neon.h - shaintrin.h clflushoptintrin.h xsavecintrin.h - xsavesintrin.h avx512dqintrin.h avx512bwintrin.h - avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h -@@ -396,6 +397,7 @@ x86_64-*-*) - rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h - adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h - avx512cdintrin.h avx512erintrin.h avx512pfintrin.h -+ arm_neon.h - shaintrin.h clflushoptintrin.h xsavecintrin.h - xsavesintrin.h avx512dqintrin.h avx512bwintrin.h - avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h -@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) - TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` - ;; - aarch64*-*-linux*) -- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" -+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" - tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" -+ extra_options="${extra_options} linux-android.opt" - tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" - case $target in - aarch64_be-*) - tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" - ;; -+ aarch64*-*-linux-android*) -+ tm_file="${tm_file} aarch64/aarch64-linux-android.h" -+ ;; - esac - aarch64_multilibs="${with_multilib_list}" - if test "$aarch64_multilibs" = "default"; then -@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian. - tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" - extra_options="${extra_options} linux-android.opt" - case ${target} in -+ mips64*android*) -+ default_mips_arch=mips64r6 -+ default_mips_abi=64 -+ tm_file="${tm_file} mips/android.h" -+ tmake_file="${tmake_file} mips/t-linux-android64" -+ ;; -+ mips*android*) -+ default_mips_arch=mips32 -+ tm_file="${tm_file} mips/android.h" -+ tmake_file="$tmake_file mips/t-linux-android" -+ ;; - mipsisa32r6*) - default_mips_arch=mips32r6 - ;; -diff --git a/gcc/config.in b/gcc/config.in -index 115cb61..9339168 100644 ---- a/gcc/config.in -+++ b/gcc/config.in -@@ -2119,6 +2119,12 @@ - #endif - - -+/* Define if your system supports PT_GNU_EH_FRAME for static executable. */ -+#ifndef USED_FOR_TARGET -+#undef USE_EH_FRAME_HDR_FOR_STATIC -+#endif -+ -+ - /* Define to 1 if the 'long long' type is wider than 'long' but still - efficiently supported by the host hardware. 
*/ - #ifndef USED_FOR_TARGET -diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h -new file mode 100644 -index 0000000..cffd84d ---- /dev/null -+++ b/gcc/config/aarch64/aarch64-linux-android.h -@@ -0,0 +1,63 @@ -+/* Machine description for AArch64 architecture. -+ Copyright (C) 2014 Free Software Foundation, Inc. -+ -+ This file is part of GCC. -+ -+ GCC is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 3, or (at your option) -+ any later version. -+ -+ GCC is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with GCC; see the file COPYING3. If not see -+ . */ -+ -+#ifndef GCC_AARCH64_LINUX_ANDROID_H -+#define GCC_AARCH64_LINUX_ANDROID_H -+ -+ -+#undef TARGET_OS_CPP_BUILTINS -+#define TARGET_OS_CPP_BUILTINS() \ -+ do \ -+ { \ -+ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ -+ ANDROID_TARGET_OS_CPP_BUILTINS(); \ -+ } \ -+ while (0) -+ -+#undef LINK_SPEC -+#define LINK_SPEC \ -+ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ -+ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) -+ -+#undef CC1_SPEC -+#define CC1_SPEC \ -+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) -+ -+#define CC1PLUS_SPEC \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) -+ -+#undef LIB_SPEC -+#define LIB_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ -+ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) -+ -+#undef STARTFILE_SPEC -+#define STARTFILE_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) -+ -+#undef ENDFILE_SPEC -+#define ENDFILE_SPEC \ -+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) -+ -+#ifdef IN_LIBGCC2 -+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) -+#endif -+ -+#endif /* GCC_AARCH64_LINUX_ANDROID_H */ -diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h -index 5fcaa59..6864195 100644 ---- a/gcc/config/aarch64/aarch64-linux.h -+++ b/gcc/config/aarch64/aarch64-linux.h -@@ -21,7 +21,14 @@ - #ifndef GCC_AARCH64_LINUX_H - #define GCC_AARCH64_LINUX_H - --#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -+#ifndef RUNTIME_ROOT_PREFIX -+#define RUNTIME_ROOT_PREFIX "" -+#endif -+#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -+#ifdef BIONIC_DYNAMIC_LINKER -+#undef BIONIC_DYNAMIC_LINKER -+#endif -+#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" - - #undef MUSL_DYNAMIC_LINKER - #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" -diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h -index 093c38b..54b3e0c 100644 ---- a/gcc/config/alpha/elf.h -+++ b/gcc/config/alpha/elf.h -@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; - I imagine that other systems will catch up. In the meantime, it - doesn't harm to make sure that the data exists to be used later. 
*/ - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif -diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c -index 5974c65..71b2c7a 100644 ---- a/gcc/config/arm/arm.c -+++ b/gcc/config/arm/arm.c -@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) - memsize = MEM_SIZE (x); - - /* Only certain alignment specifiers are supported by the hardware. */ -- if (memsize == 32 && (align % 32) == 0) -+ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC -+ honors stricter alignment of composite type in user code, it doesn't -+ observe the alignment of memory passed as an extra argument for function -+ returning large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ -+ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) - align_bits = 256; -- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) -+ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) - align_bits = 128; - else if (memsize >= 8 && (align % 8) == 0) - align_bits = 64; -diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h -index ad123dd..97b059d 100644 ---- a/gcc/config/arm/arm.h -+++ b/gcc/config/arm/arm.h -@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes - - #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ - || (TARGET_THUMB1 \ -+ && !inline_thumb1_jump_table \ - && (optimize_size || flag_pic))) - - #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ -- (TARGET_THUMB1 \ -+ (TARGET_THUMB1 && !inline_thumb1_jump_table \ - ? (min >= 0 && max < 512 \ - ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ - : min >= -256 && max < 256 \ -diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md -index 47171b9..eb22d11 100644 ---- a/gcc/config/arm/arm.md -+++ b/gcc/config/arm/arm.md -@@ -8179,7 +8179,7 @@ - (match_operand:SI 2 "const_int_operand" "") ; total range - (match_operand:SI 3 "" "") ; table label - (match_operand:SI 4 "" "")] ; Out of range label -- "TARGET_32BIT || optimize_size || flag_pic" -+ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" - " - { - enum insn_code code; -diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt -index 0ebe017..7724538 100644 ---- a/gcc/config/arm/arm.opt -+++ b/gcc/config/arm/arm.opt -@@ -193,6 +193,10 @@ mthumb-interwork - Target Report Mask(INTERWORK) - Support calls between Thumb and ARM instruction sets. - -+minline-thumb1-jumptable -+Target Report Var(inline_thumb1_jump_table) -+Inline Thumb1 Jump table code -+ - mtls-dialect= - Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) - Specify thread local storage scheme. 
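
The arm.opt hunk above adds a -minline-thumb1-jumptable option whose variable, inline_thumb1_jump_table, is then tested in CASE_VECTOR_PC_RELATIVE, CASE_VECTOR_SHORTEN_MODE, the casesi expander and JUMP_TABLES_IN_TEXT_SECTION: passing the option opts Thumb-1 code out of the compressed, PC-relative in-text dispatch tables otherwise used under -Os or -fpic. A minimal sketch of the kind of source affected; the file name and command line are illustrative assumptions, not part of the patch:

/* switch_dense.c - a dense switch that GCC lowers to a jump table (illustrative). */
int classify(int c)
{
    switch (c) {
    case 0: return 10;
    case 1: return 11;
    case 2: return 12;
    case 3: return 13;
    case 4: return 14;
    case 5: return 15;
    default: return -1;
    }
}
/* Illustrative invocation of the option added by this patch:
 *    arm-linux-androideabi-gcc -mthumb -Os -minline-thumb1-jumptable -S switch_dense.c
 * Without the option, -Os/-fpic Thumb-1 code uses the compressed PC-relative table
 * form; with it, CASE_VECTOR_PC_RELATIVE and JUMP_TABLES_IN_TEXT_SECTION evaluate
 * to false for Thumb-1, per the arm.h/arm.md/elf.h hunks above. */
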
-diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h -index 77f3055..32158ed 100644 ---- a/gcc/config/arm/elf.h -+++ b/gcc/config/arm/elf.h -@@ -56,8 +56,7 @@ - #undef SUBSUBTARGET_EXTRA_SPECS - #define SUBSUBTARGET_EXTRA_SPECS - --#ifndef ASM_SPEC --#define ASM_SPEC "\ -+#define DEFAULT_ASM_SPEC "\ - %{mbig-endian:-EB} \ - %{mlittle-endian:-EL} \ - %(asm_cpu_spec) \ -@@ -66,6 +65,9 @@ - %{mthumb-interwork:-mthumb-interwork} \ - %{mfloat-abi=*} %{mfpu=*} \ - %(subtarget_extra_asm_spec)" -+ -+#ifndef ASM_SPEC -+#define ASM_SPEC DEFAULT_ASM_SPEC - #endif - - /* The ARM uses @ are a comment character so we need to redefine -@@ -104,7 +106,8 @@ - the code more efficient, but for Thumb-1 it's better to put them out of - band unless we are generating compressed tables. */ - #define JUMP_TABLES_IN_TEXT_SECTION \ -- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) -+ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ -+ && (optimize_size || flag_pic))) - - #ifndef LINK_SPEC - #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" -diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h -index ace8481..ed7f4d0 100644 ---- a/gcc/config/arm/linux-eabi.h -+++ b/gcc/config/arm/linux-eabi.h -@@ -108,11 +108,16 @@ - #define CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ - GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ -- ANDROID_CC1_SPEC) -+ ANDROID_CC1_SPEC("-fpic")) - - #define CC1PLUS_SPEC \ - LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) - -+#undef ASM_SPEC -+#define ASM_SPEC \ -+ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ -+ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) -+ - #undef LIB_SPEC - #define LIB_SPEC \ - LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ -diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi -index 8f1307c..cbbec5b 100644 ---- a/gcc/config/arm/t-linux-androideabi -+++ b/gcc/config/arm/t-linux-androideabi -@@ -1,8 +1,9 @@ --MULTILIB_OPTIONS = march=armv7-a mthumb --MULTILIB_DIRNAMES = armv7-a thumb --MULTILIB_EXCEPTIONS = -+MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard -+MULTILIB_DIRNAMES = armv7-a thumb hard -+MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* - MULTILIB_MATCHES = - MULTILIB_OSDIRNAMES = -+MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch - - # The "special" multilib can be used to build native applications for Android, - # as opposed to native shared libraries that are then called via JNI. -diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h -index 5ded869..5f51ac8 100644 ---- a/gcc/config/freebsd.h -+++ b/gcc/config/freebsd.h -@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see - #define LIB_SPEC FBSD_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #ifdef TARGET_LIBC_PROVIDES_SSP - #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ -diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h -index b0bf40a..d1874bc 100644 ---- a/gcc/config/gnu-user.h -+++ b/gcc/config/gnu-user.h -@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see - #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #undef LINK_GCC_C_SEQUENCE_SPEC - #define LINK_GCC_C_SEQUENCE_SPEC \ -diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h -new file mode 100644 -index 0000000..10944f3 ---- /dev/null -+++ b/gcc/config/i386/arm_neon.h -@@ -0,0 +1,16622 @@ -+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com -+ -+//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved. -+ -+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -+ -+//By downloading, copying, installing or using the software you agree to this license. -+//If you do not agree to this license, do not download, install, copy or use the software. -+ -+// License Agreement -+ -+//Permission to use, copy, modify, and/or distribute this software for any -+//purpose with or without fee is hereby granted, provided that the above -+//copyright notice and this permission notice appear in all copies. -+ -+//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH -+//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY -+//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, -+//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -+//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -+//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -+//PERFORMANCE OF THIS SOFTWARE. -+ -+//***************************************************************************************** -+// This file is intended to simplify ARM->IA32 porting -+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h") -+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below -+// MMX instruction set is not used due to performance overhead and the necessity to use the -+// EMMS instruction (_mm_empty())for mmx-x87 floating point switching -+//***************************************************************************************** -+ -+//!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual. -+//!!!!!!! Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for -+//!!!!!!! greater performance. It can be done by -msse4.2 compiler switch. 
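
The gcc/config/i386/arm_neon.h embedded in the patch file being removed here is Intel's NEON-to-SSE shim: x86 Android builds keep including <arm_neon.h> unchanged and each NEON intrinsic is expanded to SSE/SSE2 (optionally SSE4.x) equivalents. A hedged usage sketch, assuming an x86 toolchain with this header installed; the source file and build line are illustrative, not from the patch:

/* neon_user.c - unchanged NEON source built for x86 through the shim (illustrative). */
#include <arm_neon.h>

int32x4_t add4(int32x4_t a, int32x4_t b)
{
    return vaddq_s32(a, b);  /* on x86 this becomes an SSE2 packed 32-bit add */
}

/* Illustrative build line (not from the patch):
 *    i686-linux-android-gcc -O2 -msse4.2 -c neon_user.c
 * -msse4.2 defines __SSE4_2__, which the header tests to enable its USE_SSE4 paths. */
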
-+ -+#ifndef NEON2SSE_H -+#define NEON2SSE_H -+ -+#ifndef USE_SSE4 -+#if defined(__SSE4_2__) -+ #define USE_SSE4 -+#endif -+#endif -+ -+#include //SSE -+#include //SSE2 -+#include //SSE3 -+#include //SSSE3 -+#ifdef USE_SSE4 -+#include //SSE4.1 -+#include //SSE4.2 -+#endif -+ -+ -+//*************** functions and data attributes, compiler dependent ********************************* -+//*********************************************************************************** -+#ifdef __GNUC__ -+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -+#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16))) -+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -+#if _GCC_VERSION < 40500 -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function -+#else -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function -+#endif -+#if defined(__x86_64__) -+ #define _NEON2SSE_64BIT __x86_64__ -+#endif -+#else -+#define _NEON2SSE_ALIGN_16 __declspec(align(16)) -+#define _NEON2SSE_INLINE __inline -+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER) -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function -+#if defined(_M_X64) -+ #define _NEON2SSE_64BIT _M_X64 -+#endif -+#else -+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function -+#endif -+#endif -+ -+#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4) -+ #define _NEON2SSE_64BIT_SSE4 -+#endif -+ -+/*********************************************************************************************************************/ -+// data types conversion -+/*********************************************************************************************************************/ -+#if defined(_MSC_VER) && (_MSC_VER < 1300) -+ typedef signed char int8_t; -+ typedef unsigned char uint8_t; -+ typedef signed short int16_t; -+ typedef unsigned short uint16_t; -+ typedef signed int int32_t; -+ typedef unsigned int uint32_t; -+ typedef signed long long int64_t; -+ typedef unsigned long long uint64_t; -+#elif defined(_MSC_VER) -+ typedef signed __int8 int8_t; -+ typedef unsigned __int8 uint8_t; -+ typedef signed __int16 int16_t; -+ typedef unsigned __int16 uint16_t; -+ typedef signed __int32 int32_t; -+ typedef unsigned __int32 uint32_t; -+ -+ typedef signed long long int64_t; -+ typedef unsigned long long uint64_t; -+#else -+#include -+#include -+#endif -+ -+typedef union __m64_128 { -+ uint64_t m64_u64[1]; -+ float m64_f32[2]; -+ int8_t m64_i8[8]; -+ int16_t m64_i16[4]; -+ int32_t m64_i32[2]; -+ int64_t m64_i64[1]; -+ uint8_t m64_u8[8]; -+ uint16_t m64_u16[4]; -+ uint32_t m64_u32[2]; -+} __m64_128; -+ -+typedef __m64_128 int8x8_t; -+typedef __m64_128 uint8x8_t; -+typedef __m64_128 int16x4_t; -+typedef __m64_128 uint16x4_t; -+typedef __m64_128 int32x2_t; -+typedef __m64_128 uint32x2_t; -+typedef __m64_128 int64x1_t; -+typedef __m64_128 uint64x1_t; -+typedef __m64_128 poly8x8_t; -+typedef __m64_128 poly16x4_t; -+ -+typedef __m64_128 float32x2_t; -+typedef __m128 float32x4_t; -+ -+typedef __m128 float16x4_t; //not supported by IA, for compatibility -+typedef __m128 float16x8_t; //not supported by IA, for compatibility -+ -+typedef __m128i int8x16_t; -+typedef __m128i int16x8_t; -+typedef __m128i int32x4_t; -+typedef __m128i int64x2_t; -+typedef __m128i uint8x16_t; -+typedef __m128i uint16x8_t; -+typedef __m128i uint32x4_t; -+typedef __m128i 
uint64x2_t; -+typedef __m128i poly8x16_t; -+typedef __m128i poly16x8_t; -+ -+#if defined(_MSC_VER) -+ #define SINT_MIN (-2147483647 - 1) /* min signed int value */ -+ #define SINT_MAX 2147483647 /* max signed int value */ -+#else -+ #define SINT_MIN INT_MIN /* min signed int value */ -+ #define SINT_MAX INT_MAX /* max signed int value */ -+#endif -+ -+typedef float float32_t; -+#if !defined(__clang__) -+typedef float __fp16; -+#endif -+ -+typedef uint8_t poly8_t; -+typedef uint16_t poly16_t; -+ -+ -+//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in -+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types -+struct int8x16x2_t { -+ int8x16_t val[2]; -+}; -+struct int16x8x2_t { -+ int16x8_t val[2]; -+}; -+struct int32x4x2_t { -+ int32x4_t val[2]; -+}; -+struct int64x2x2_t { -+ int64x2_t val[2]; -+}; -+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!! -+struct int8x8x2_t { -+ int8x8_t val[2]; -+}; -+struct int16x4x2_t { -+ int16x4_t val[2]; -+}; -+struct int32x2x2_t { -+ int32x2_t val[2]; -+}; -+struct int64x1x2_t { -+ int64x1_t val[2]; -+}; -+ -+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy -+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy -+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy -+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy -+ -+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy -+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy -+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy -+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */ -+typedef struct int8x16x2_t uint8x16x2_t; -+typedef struct int16x8x2_t uint16x8x2_t; -+typedef struct int32x4x2_t uint32x4x2_t; -+typedef struct int64x2x2_t uint64x2x2_t; -+typedef struct int8x16x2_t poly8x16x2_t; -+typedef struct int16x8x2_t poly16x8x2_t; -+ -+typedef struct int8x8x2_t uint8x8x2_t; -+typedef struct int16x4x2_t uint16x4x2_t; -+typedef struct int32x2x2_t uint32x2x2_t; -+typedef struct int64x1x2_t uint64x1x2_t; -+typedef struct int8x8x2_t poly8x8x2_t; -+typedef struct int16x4x2_t poly16x4x2_t; -+ -+//float -+struct float32x4x2_t { -+ float32x4_t val[2]; -+}; -+struct float16x8x2_t { -+ float16x8_t val[2]; -+}; -+struct float32x2x2_t { -+ float32x2_t val[2]; -+}; -+ -+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy -+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy -+typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy -+typedef float16x8x2_t float16x4x2_t; -+ -+//4 -+struct int8x16x4_t { -+ int8x16_t val[4]; -+}; -+struct int16x8x4_t { -+ int16x8_t val[4]; -+}; -+struct int32x4x4_t { -+ int32x4_t val[4]; -+}; -+struct int64x2x4_t { -+ int64x2_t val[4]; -+}; -+ -+struct int8x8x4_t { -+ int8x8_t val[4]; -+}; -+struct int16x4x4_t { -+ int16x4_t val[4]; -+}; -+struct int32x2x4_t { -+ int32x2_t val[4]; -+}; -+struct int64x1x4_t { -+ int64x1_t val[4]; -+}; -+ -+typedef struct int8x16x4_t int8x16x4_t; 
//for C compilers to make them happy -+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy -+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy -+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy -+ -+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy -+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy -+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy -+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ -+typedef struct int8x8x4_t uint8x8x4_t; -+typedef struct int16x4x4_t uint16x4x4_t; -+typedef struct int32x2x4_t uint32x2x4_t; -+typedef struct int64x1x4_t uint64x1x4_t; -+typedef struct int8x8x4_t poly8x8x4_t; -+typedef struct int16x4x4_t poly16x4x4_t; -+ -+typedef struct int8x16x4_t uint8x16x4_t; -+typedef struct int16x8x4_t uint16x8x4_t; -+typedef struct int32x4x4_t uint32x4x4_t; -+typedef struct int64x2x4_t uint64x2x4_t; -+typedef struct int8x16x4_t poly8x16x4_t; -+typedef struct int16x8x4_t poly16x8x4_t; -+ -+struct float32x4x4_t { -+ float32x4_t val[4]; -+}; -+struct float16x8x4_t { -+ float16x8_t val[4]; -+}; -+struct float32x2x4_t { -+ float32x2_t val[4]; -+}; -+ -+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy -+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy -+typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy -+typedef float16x8x4_t float16x4x4_t; -+ -+//3 -+struct int16x8x3_t { -+ int16x8_t val[3]; -+}; -+struct int32x4x3_t { -+ int32x4_t val[3]; -+}; -+struct int64x2x3_t { -+ int64x2_t val[3]; -+}; -+struct int8x16x3_t { -+ int8x16_t val[3]; -+}; -+ -+struct int16x4x3_t { -+ int16x4_t val[3]; -+}; -+struct int32x2x3_t { -+ int32x2_t val[3]; -+}; -+struct int64x1x3_t { -+ int64x1_t val[3]; -+}; -+struct int8x8x3_t { -+ int8x8_t val[3]; -+}; -+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy -+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy -+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy -+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy -+ -+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy -+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy -+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy -+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy -+ -+ -+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ -+typedef struct int8x16x3_t uint8x16x3_t; -+typedef struct int16x8x3_t uint16x8x3_t; -+typedef struct int32x4x3_t uint32x4x3_t; -+typedef struct int64x2x3_t uint64x2x3_t; -+typedef struct int8x16x3_t poly8x16x3_t; -+typedef struct int16x8x3_t poly16x8x3_t; -+typedef struct int8x8x3_t uint8x8x3_t; -+typedef struct int16x4x3_t uint16x4x3_t; -+typedef struct int32x2x3_t uint32x2x3_t; -+typedef struct int64x1x3_t uint64x1x3_t; -+typedef struct int8x8x3_t poly8x8x3_t; -+typedef struct int16x4x3_t poly16x4x3_t; -+ -+//float -+struct float32x4x3_t { -+ float32x4_t val[3]; -+}; -+struct float32x2x3_t { -+ float32x2_t val[3]; -+}; -+struct 
float16x8x3_t { -+ float16x8_t val[3]; -+}; -+ -+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy -+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy -+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy -+typedef float16x8x3_t float16x4x3_t; -+ -+ -+//**************************************************************************** -+//****** Porting auxiliary macros ******************************************** -+ -+//** floating point related macros ** -+#define _M128i(a) _mm_castps_si128(a) -+#define _M128(a) _mm_castsi128_ps(a) -+//here the most performance effective implementation is compiler and 32/64 bits build dependent -+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) ) -+ -+ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a))) -+ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp); -+ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp)); -+#else -+ //for 32bit gcc and Microsoft compilers builds -+ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a)) -+ #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp) -+ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp)) -+#endif -+#define _pM128(a) _mm_castsi128_ps(_pM128i(a)) -+ -+#define return64(a) _M64(res64,a); return res64; -+#define return64f(a) _M64f(res64,a); return res64; -+ -+#define _Ui64(a) (*(uint64_t*)&(a)) -+#define _UNSIGNED_T(a) u ## a -+ -+#define _SIGNBIT64 ((uint64_t)1 << 63) -+#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6)) -+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) ) -+ -+#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it" -+#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it" -+ -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+#define __constrange(min,max) const -+#define __transfersize(size) -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ -+//************************************************************************* -+//************************************************************************* -+//********* Functions declarations as declared in original arm_neon.h ***** -+//************************************************************************* -+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes. 
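
The helper macros above (_pM128i, _M64, _M64f, return64 and friends) are how the shim emulates 64-bit NEON "d" registers: each one lives in memory as the __m64_128 union, is widened into a real __m128i for the SSE operation, and has its low 64 bits written back afterwards. A simplified, self-contained restatement of that idiom under SSE2; the names d_reg and add_d_s32 are illustrative, not taken from the header:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>

/* Simplified stand-in for the header's __m64_128 union. */
typedef union { int32_t i32[2]; int64_t i64[1]; } d_reg;

/* vadd_s32-style lane add: widen to __m128i, operate, store the low 64 bits back. */
static inline d_reg add_d_s32(d_reg a, d_reg b)
{
    __m128i va = _mm_loadl_epi64((const __m128i *)&a);   /* _pM128i(a) on 32-bit builds */
    __m128i vb = _mm_loadl_epi64((const __m128i *)&b);
    __m128i vr = _mm_add_epi32(va, vb);
    d_reg res;
    _mm_storel_epi64((__m128i *)&res, vr);               /* _M64(res, vr) */
    return res;                                          /* i.e. the header's return64(vr) */
}
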
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 -+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 -+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 -+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 -+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 -+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 -+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 -+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 -+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 -+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 -+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 -+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 -+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 -+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 -+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 -+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 -+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 -+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 -+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. -+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 -+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 -+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 -+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 -+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0 -+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 -+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 -+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 -+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 -+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 -+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0 -+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 -+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 -+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 -+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 -+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 -+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0 -+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0 -+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 -+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 -+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0 -+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 -+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 -+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0 -+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 -+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1 -+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 -+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 -+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 -+uint8x8_t vrhadd_u8(uint8x8_t 
a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 -+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0 -+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 -+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 -+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 -+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 -+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 -+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0 -+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 -+//Vector saturating add: vqadd -> Vr[i]:=sat(Va[i]+Vb[i]) -+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 -+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 -+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 -+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 -+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 -+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0 -+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 -+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 -+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 -+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 -+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 -+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 -+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 -+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0 -+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 -+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 -+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i] -+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 -+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 -+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 -+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 -+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 -+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 -+//Vector rounding add high half: vraddhn -+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 -+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 -+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 -+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 -+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 -+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 -+//Multiplication -+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] -+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 -+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 -+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 -+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 -+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 -+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 -+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 -+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 -+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 
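
The vqadd family declared above is specified as Vr[i] := sat(Va[i] + Vb[i]), i.e. the per-lane sum clamped to the lane type's range rather than wrapped. A scalar reference for one signed 8-bit lane, for clarity only; the helper name is illustrative and not part of the header:

#include <stdint.h>

/* Scalar model of one vqadd_s8 lane: clamp the widened sum into int8_t range. */
static inline int8_t sat_add_s8(int8_t a, int8_t b)
{
    int sum = (int)a + (int)b;
    if (sum > INT8_MAX) sum = INT8_MAX;
    if (sum < INT8_MIN) sum = INT8_MIN;
    return (int8_t)sum;
}
/* The packed 8/16-bit forms have direct SSE2 counterparts
 * (_mm_adds_epi8 / _mm_adds_epi16); wider lanes need emulation. */
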
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 -+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 -+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 -+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 -+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 -+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 -+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 -+//multiply lane -+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); -+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); -+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); -+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); -+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); -+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c); -+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); -+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); -+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); -+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); -+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] -+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 -+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 -+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 -+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 -+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 -+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 -+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 -+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 -+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 -+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 -+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 -+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 -+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 -+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 -+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] -+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 -+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 -+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 -+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 -+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0 -+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 -+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] -+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 -+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 -+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 -+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); 
// VMLS.F32 d0,d0,d0 -+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 -+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 -+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 -+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 -+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 -+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 -+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 -+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 -+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 -+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 -+//Vector multiply subtract long -+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 -+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 -+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 -+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 -+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0 -+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 -+//Vector saturating doubling multiply high -+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 -+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 -+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 -+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 -+//Vector saturating rounding doubling multiply high -+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 -+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 -+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 -+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 -+//Vector saturating doubling multiply accumulate long -+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 -+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 -+//Vector saturating doubling multiply subtract long -+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 -+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 -+//Vector long multiply -+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 -+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 -+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 -+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 -+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0 -+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 -+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 -+//Vector saturating doubling long multiply -+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 -+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 -+//Subtraction -+//Vector subtract -+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 -+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 -+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 -+int64x1_t 
vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 -+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 -+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 -+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 -+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 -+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 -+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 -+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 -+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 -+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 -+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 -+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 -+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 -+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 -+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 -+//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 -+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 -+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 -+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 -+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0 -+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 -+//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i] -+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0 -+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 -+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 -+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 -+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0 -+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 -+//Vector saturating subtract -+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 -+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 -+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 -+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 -+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 -+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0 -+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 -+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 -+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 -+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 -+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 -+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 -+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 -+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0 -+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 -+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0 -+//Vector halving subtract -+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 -+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 -+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 -+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 -+uint16x4_t vhsub_u16(uint16x4_t a, 
uint16x4_t b); // VHSUB.U16 d0,d0,d0 -+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 -+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 -+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 -+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 -+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 -+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0 -+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 -+//Vector subtract high half -+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 -+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 -+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 -+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 -+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 -+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 -+//Vector rounding subtract high half -+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+//Comparison -+//Vector compare equal -+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 -+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 -+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 -+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 -+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 -+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 -+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 -+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 -+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 -+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 -+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 -+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 -+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 -+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 -+//Vector compare greater-than or equal -+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 -+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, 
q0, q0 -+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 -+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+//Vector compare less-than or equal -+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0 -+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 -+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+//Vector compare greater-than -+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 -+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 -+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+//Vector compare less-than -+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0 -+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 -+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+//Vector compare absolute greater-than or equal -+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+uint32x4_t 
vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+//Vector compare absolute less-than or equal -+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+//Vector compare absolute greater-than -+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+//Vector compare absolute less-than -+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+//Vector test bits -+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 -+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 -+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 -+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 -+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 -+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 -+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 -+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 -+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 -+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 -+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 -+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 -+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 -+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 -+//Absolute difference -+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] | -+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 -+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 -+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 -+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 -+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0 -+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 -+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 -+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 -+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 -+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 -+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 -+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0 -+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 -+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 -+//Absolute difference - long -+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 -+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 -+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 -+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 -+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0 -+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 -+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | -+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 -+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 -+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 -+uint8x8_t 
vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 -+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0 -+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 -+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 -+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 -+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 -+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 -+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0 -+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 -+//Absolute difference and accumulate - long -+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 -+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 -+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 -+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 -+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0 -+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 -+//Max/Min -+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] -+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 -+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 -+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 -+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 -+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0 -+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 -+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 -+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 -+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 -+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 -+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 -+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 -+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 -+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 -+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] -+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 -+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 -+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 -+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 -+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0 -+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 -+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 -+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 -+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 -+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 -+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 -+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 -+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 -+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 -+//Pairwise addition -+//Pairwise add -+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 -+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 -+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 -+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 -+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 -+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 -+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0 -+//Long pairwise add -+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 -+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 -+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 -+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 -+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0 -+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 -+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 -+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 -+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 -+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 -+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0 -+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 -+//Long pairwise add and accumulate -+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 -+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 -+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 -+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 -+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0 -+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 -+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 -+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 -+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 -+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 -+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 -+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 -+//Folding maximum vpmax -> takes maximum of adjacent pairs -+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 -+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 -+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 -+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 -+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 
d0,d0,d0 -+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 -+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 -+//Folding minimum vpmin -> takes minimum of adjacent pairs -+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 -+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 -+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 -+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 -+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0 -+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 -+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 -+//Reciprocal/Sqrt -+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 -+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 -+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 -+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 -+//Shifts by signed variable -+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) -+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 -+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 -+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 -+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 -+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 -+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0 -+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 -+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 -+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 -+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 -+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 -+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 -+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 -+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 -+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 -+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 -+//Vector saturating shift left: (negative values shift right) -+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 -+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 -+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 -+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 -+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 -+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0 -+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 -+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 -+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 -+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 -+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 -+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 -+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 -+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 -+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 -+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 -+//Vector rounding shift left: (negative values 
shift right) -+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 -+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 -+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 -+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 -+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 -+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0 -+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 -+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 -+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 -+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 -+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 -+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 -+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 -+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 -+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 -+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 -+//Vector saturating rounding shift left: (negative values shift right) -+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 -+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 -+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 -+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 -+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 -+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0 -+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 -+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 -+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 -+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 -+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 -+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 -+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 -+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 -+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 -+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 -+//Shifts by a constant -+//Vector shift right by constant -+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 -+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 -+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 -+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 -+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 -+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16 -+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 -+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 -+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 -+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 -+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 -+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 -+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // 
VSHR.U8 q0,q0,#8 -+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 -+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 -+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 -+//Vector shift left by constant -+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+//Vector rounding shift right by constant -+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 -+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 -+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 -+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 -+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 -+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16 -+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 -+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 -+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 -+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 -+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 -+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 -+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 -+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 -+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 -+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 -+//Vector shift right by constant and accumulate -+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 -+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 -+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 -+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 -+uint8x8_t 
vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 -+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16 -+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 -+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 -+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 -+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 -+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 -+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 -+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 -+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 -+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 -+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 -+//Vector rounding shift right by constant and accumulate -+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 -+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 -+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 -+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 -+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 -+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16 -+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 -+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 -+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 -+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 -+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 -+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 -+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 -+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 -+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 -+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 -+//Vector saturating shift left by constant -+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 -+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 -+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 -+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 -+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 -+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0 -+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 -+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int 
b); // VQSHL.U64 d0,d0,#0 -+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 -+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 -+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 -+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 -+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 -+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 -+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 -+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 -+//Vector signed->unsigned saturating shift left by constant -+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 -+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 -+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 -+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 -+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 -+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 -+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 -+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 -+//Vector narrowing shift right by constant -+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+//Vector signed->unsigned narrowing saturating shift right by constant -+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 -+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 -+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 -+//Vector signed->unsigned rounding narrowing saturating shift right by constant -+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 -+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 -+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 -+//Vector narrowing saturating shift right by constant -+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 -+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 -+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 -+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8 -+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 -+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 -+//Vector rounding narrowing shift right by constant -+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) 
int b); // VRSHRN.I32 d0,q0,#16 -+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+//Vector rounding narrowing saturating shift right by constant -+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 -+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 -+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 -+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 -+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 -+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 -+//Vector widening shift left by constant -+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 -+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 -+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 -+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 -+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0 -+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 -+//Shifts with insert -+//Vector shift right and insert -+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // 
VSRI.16 q0,q0,#16 -+//Vector shift left and insert -+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 
-+//Load a single vector from memory -+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] -+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] -+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] -+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] -+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] -+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] -+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] -+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] -+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] -+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] -+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] -+//Load a single lane from memory -+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] -+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0] -+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] -+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0] -+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const 
* ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] -+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] -+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] -+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] -+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0] -+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0] -+float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] -+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] -+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] -+//Load all lanes of vector with same value from memory -+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * 
ptr); // VLD1.32 {d0[]}, [r0] -+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+//Store a single vector or lane. Stores all lanes or a single lane of a vector. -+//Store a single vector into memory -+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] -+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] -+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] -+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] -+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] -+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] -+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] -+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] -+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] -+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] -+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] -+//Store a lane of a vector into memory -+//Loads of an N-element structure -+//Load N-element structure from memory -+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] -+float32x4x2_t 
vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] -+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+uint16x8x4_t 
vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+//Load all lanes of N-element structure with same value from memory -+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t 
const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+//Load a single lane of N-element structure from memory -+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned -+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] -+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] -+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 
-+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
-+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
-+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+//Store N-element structure to memory
-+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+//Store a single lane of N-element structure to memory
-+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] -+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] -+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] -+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] -+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] -+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] -+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] -+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] -+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] -+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] -+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void 
vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] -+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] -+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // 
VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] -+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. -+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] -+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] -+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] -+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] -+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] -+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] -+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] -+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] -+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+//Initialize a vector from a literal bit pattern. 
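// Illustrative sketch, not part of the patch: minimal use of the lane get/set intrinsics
// listed above, assuming this header is installed as <arm_neon.h> and that every lane
// argument is a compile-time constant, as the __constrange annotations require.

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    int32x4_t v = vdupq_n_s32(7);     /* all four lanes = 7 (vdupq_n_s32 is declared further below) */
    v = vsetq_lane_s32(42, v, 2);     /* overwrite lane 2; the lane index is a literal */
    /* move lanes back into general-purpose registers */
    printf("%d %d\n", vgetq_lane_s32(v, 0), vgetq_lane_s32(v, 2));  /* prints "7 42" */
    return 0;
}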
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 -+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 -+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 -+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 -+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 -+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 -+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 -+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 -+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 -+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 -+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 -+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 -+//Set all lanes to same value -+//Load all lanes of vector to the same literal value -+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 -+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 -+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 -+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 -+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 -+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 -+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 -+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 -+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 -+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 -+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 -+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 -+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 -+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 -+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 -+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 -+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 -+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 -+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 -+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 -+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 -+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 -+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 -+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 -+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 -+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 -+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 -+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 -+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 -+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+//Load all lanes of the vector to the value of a lane of a vector -+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+uint32x2_t vdup_lane_u32(uint32x2_t vec, 
__constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. -+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 -+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 -+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 -+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 -+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 -+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 -+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 -+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 -+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 -+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 -+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 -+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 -+//Splitting vectors. 
These intrinsics split a 128 bit vector into 2 component 64 bit vectors -+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 -+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 -+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 -+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 -+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 -+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 -+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 -+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 -+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 -+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 -+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 -+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 -+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 -+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 -+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 -+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 -+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 -+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 -+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 -+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 -+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 -+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 -+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 -+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 -+//Converting vectors. These intrinsics are used to convert vectors. -+//Convert from float -+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 -+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 -+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 -+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 -+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 -+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 -+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 -+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 -+//Convert to float -+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 -+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 -+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 -+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 -+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 -+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 -+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 -+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 -+//Convert between floats -+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 -+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 -+//Vector narrow integer -+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 -+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 -+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 -+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 -+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 -+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 -+//Vector long move -+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 -+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 -+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 -+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 
q0,d0 -+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0 -+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 -+//Vector saturating narrow integer -+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 -+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 -+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 -+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0 -+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0 -+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0 -+//Vector saturating narrow integer signed->unsigned -+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 -+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 -+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 -+//Table look up -+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 -+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+//Extended table look up intrinsics -+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 -+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+//Operations with a scalar value -+//Vector multiply accumulate with scalar -+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] -+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] -+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] -+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] -+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 
d0,d0, d0[0] -+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0] -+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0] -+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0] -+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0] -+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0] -+//Vector widening multiply accumulate with scalar -+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0] -+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0] -+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0] -+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0] -+//Vector widening saturating doubling multiply accumulate with scalar -+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0] -+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0] -+//Vector multiply subtract with scalar -+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] -+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] -+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] -+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] -+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0] -+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0] -+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0] -+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0] -+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0] -+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0] -+//Vector widening multiply subtract with scalar -+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0] -+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0] -+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0] -+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0] -+//Vector widening saturating doubling multiply subtract with scalar -+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0] -+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0] -+//Vector 
multiply by scalar -+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] -+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] -+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] -+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] -+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] -+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] -+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] -+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] -+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] -+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] -+//Vector long multiply with scalar -+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] -+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] -+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0] -+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] -+//Vector long multiply by scalar -+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] -+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] -+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0] -+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] -+//Vector saturating doubling long multiply with scalar -+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] -+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] -+//Vector saturating doubling long multiply by scalar -+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] -+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] -+//Vector saturating doubling multiply high with scalar -+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] -+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] -+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] -+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] -+//Vector saturating doubling multiply high by scalar -+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] -+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] -+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] -+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] -+//Vector saturating rounding doubling multiply high with scalar -+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] -+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] -+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] -+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] -+//Vector rounding saturating doubling multiply high by scalar 
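// Illustrative sketch, not part of the patch (the vqrdmulh_lane prototypes announced by the
// comment above continue right after it): a scaled accumulate built from the scalar-operand
// ("_n_") forms, assuming the buffer length is a multiple of four.

#include <arm_neon.h>

void saxpy4(float *y, const float *x, float a, int n)
{
    int i;
    for (i = 0; i < n; i += 4) {
        float32x4_t vy = vld1q_f32(y + i);
        float32x4_t vx = vld1q_f32(x + i);
        vy = vmlaq_n_f32(vy, vx, a);   /* vy += vx * a in every lane */
        vst1q_f32(y + i, vy);
    }
}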
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] -+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] -+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] -+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] -+//Vector multiply accumulate with scalar -+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] -+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] -+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] -+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] -+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] -+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] -+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] -+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] -+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] -+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] -+//Vector widening multiply accumulate with scalar -+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] -+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] -+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0] -+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] -+//Vector widening saturating doubling multiply accumulate with scalar -+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] -+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] -+//Vector multiply subtract with scalar -+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] -+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] -+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] -+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] -+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] -+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] -+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] -+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] -+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] -+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] -+//Vector widening multiply subtract with scalar -+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] -+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] -+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0] -+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] -+//Vector widening saturating doubling multiply subtract with scalar -+int32x4_t 
vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] -+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] -+//Vector extract -+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 -+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 -+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 -+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 -+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 -+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 -+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 -+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 -+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 -+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 -+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0 -+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 
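// Illustrative sketch, not part of the patch: byte-swapping four 32-bit words in place with
// vrev32q_u8 (declared just below), e.g. when converting big-endian data.

#include <arm_neon.h>

void bswap32_block(uint8_t *p)        /* p must point at 16 readable and writable bytes */
{
    uint8x16_t v = vld1q_u8(p);
    vst1q_u8(p, vrev32q_u8(v));       /* reverse the bytes inside each 32-bit lane */
}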
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0 -+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0 -+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0 -+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0 -+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0 -+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0 -+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0 -+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0 -+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0 -+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 -+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 -+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 -+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 -+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 -+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 -+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 -+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 -+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 -+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0 -+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0 -+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0 -+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0 -+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0 -+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0 -+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 -+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 -+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 -+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 -+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 -+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0 -+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0 -+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0 -+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0 -+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 -+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 -+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 -+//Other single operand arithmetic -+//Absolute: Vd[i] = |Va[i]| -+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0 -+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0 -+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0 -+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 -+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 -+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 -+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 -+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 -+//Saturating absolute: Vd[i] = sat(|Va[i]|) -+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 -+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 -+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 -+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 -+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 -+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 -+//Negate: Vd[i] = - Va[i] -+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0 -+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0 -+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0 -+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0 -+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 -+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 -+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 -+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 -+//Saturating Negate: sat(Vd[i] = - Va[i]) -+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0 -+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0 
-+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0 -+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 -+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 -+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 -+//Count leading sign bits -+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 -+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 -+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 -+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 -+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 -+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 -+//Count leading zeros -+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0 -+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0 -+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0 -+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0 -+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0 -+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0 -+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 -+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 -+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 -+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 -+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 -+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 -+//Count number of set bits -+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 -+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 -+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 -+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 -+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 -+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 -+//Reciprocal estimate -+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 -+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 -+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 -+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 -+//Reciprocal square root estimate -+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 -+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 -+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 -+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 -+//Logical operations -+//Bitwise not -+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 -+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 -+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 -+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 -+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 -+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 -+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 -+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 -+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 -+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 -+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 -+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 -+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 -+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 -+//Bitwise and -+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 -+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 -+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 -+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 -+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 -+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 -+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 -+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 -+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 -+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 -+int32x4_t vandq_s32(int32x4_t a, int32x4_t 
b); // VAND q0,q0,q0 -+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 -+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 -+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 -+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 -+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 -+//Bitwise or -+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 -+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 -+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 -+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 -+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 -+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 -+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 -+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 -+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 -+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 -+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 -+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 -+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 -+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 -+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 -+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 -+//Bitwise exclusive or (EOR or XOR) -+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 -+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 -+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 -+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 -+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 -+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 -+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 -+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 -+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 -+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 -+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 -+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 -+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 -+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 -+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 -+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 -+//Bit Clear -+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 -+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 -+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 -+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 -+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 -+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 -+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 -+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 -+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 -+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 -+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 -+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 -+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 -+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 -+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 -+//Bitwise OR complement -+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 -+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 -+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 -+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 -+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 -+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 -+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 -+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 -+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 -+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 -+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 -+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 -+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 -+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 -+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 -+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 -+//Bitwise Select -+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 -+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 -+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 -+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 -+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 -+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 -+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 -+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 -+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 -+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 -+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 -+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 -+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 -+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 -+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 -+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 -+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 -+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 -+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 -+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 -+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 -+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 -+//Transposition operations -+//Transpose elements -+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 -+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 -+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 -+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 -+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 -+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 -+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 -+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 -+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 -+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 -+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 -+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 -+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 -+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 -+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 -+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 -+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 -+//Interleave elements -+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 -+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 -+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 -+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 -+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 -+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 -+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 -+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 -+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 -+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 -+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 -+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 -+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 -+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 -+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 -+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 -+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 -+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 -+//De-Interleave elements -+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 -+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 -+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 -+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 -+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 -+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 -+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 -+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 -+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 -+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 -+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 -+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 -+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 -+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 -+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 -+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 -+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 -+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 -+ -+ -+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -+// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
While for release build it is not a must, -+//for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal -+// -+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers. -+ -+ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 -+ -+ #define _MM_EXTRACT_EPI16 _mm_extract_epi16 -+ #define _MM_INSERT_EPI16 _mm_insert_epi16 -+#ifdef USE_SSE4 -+ #define _MM_EXTRACT_EPI8 _mm_extract_epi8 -+ #define _MM_EXTRACT_EPI32 _mm_extract_epi32 -+ #define _MM_EXTRACT_PS _mm_extract_ps -+ -+ #define _MM_INSERT_EPI8 _mm_insert_epi8 -+ #define _MM_INSERT_EPI32 _mm_insert_epi32 -+ #define _MM_INSERT_PS _mm_insert_ps -+#ifdef _NEON2SSE_64BIT -+ #define _MM_INSERT_EPI64 _mm_insert_epi64 -+ #define _MM_EXTRACT_EPI64 _mm_extract_epi64 -+#endif -+#endif //SSE4 -+#else -+ #define _NEON2SSE_COMMA , -+ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ -+ switch(LANE) \ -+ { \ -+ case 0: return NAME(a b, 0); \ -+ case 1: return NAME(a b, 1); \ -+ case 2: return NAME(a b, 2); \ -+ case 3: return NAME(a b, 3); \ -+ case 4: return NAME(a b, 4); \ -+ case 5: return NAME(a b, 5); \ -+ case 6: return NAME(a b, 6); \ -+ case 7: return NAME(a b, 7); \ -+ case 8: return NAME(a b, 8); \ -+ case 9: return NAME(a b, 9); \ -+ case 10: return NAME(a b, 10); \ -+ case 11: return NAME(a b, 11); \ -+ case 12: return NAME(a b, 12); \ -+ case 13: return NAME(a b, 13); \ -+ case 14: return NAME(a b, 14); \ -+ case 15: return NAME(a b, 15); \ -+ default: return NAME(a b, 0); \ -+ } -+ -+ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ -+ switch(LANE) \ -+ { \ -+ case 0: return NAME(vec p,0); \ -+ case 1: return NAME(vec p,1); \ -+ case 2: return NAME(vec p,2); \ -+ case 3: return NAME(vec p,3); \ -+ case 4: return NAME(vec p,4); \ -+ case 5: return NAME(vec p,5); \ -+ case 6: return NAME(vec p,6); \ -+ case 7: return NAME(vec p,7); \ -+ default: return NAME(vec p,0); \ -+ } -+ -+ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ -+ switch(LANE) \ -+ { \ -+ case case0: return NAME(vec p,case0); \ -+ case case1: return NAME(vec p,case1); \ -+ case case2: return NAME(vec p,case2); \ -+ case case3: return NAME(vec p,case3); \ -+ default: return NAME(vec p,case0); \ -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) -+ { -+ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) -+ } -+ -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) -+ { -+ 
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) -+ } -+ -+#ifdef _NEON2SSE_64BIT -+ //the special case of functions available only for SSE4 and 64-bit build. -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) -+ { -+ switch(LANE) { -+ case 0: -+ return _mm_insert_epi64(vec, p, 0); -+ case 1: -+ return _mm_insert_epi64(vec, p, 1); -+ default: -+ return _mm_insert_epi64(vec, p, 0); -+ } -+ } -+ -+ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) -+ { -+ if (LANE ==0) return _mm_extract_epi64(val, 0); -+ else return _mm_extract_epi64(val, 1); -+ } -+#endif -+ -+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) -+ { -+ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) -+ } -+ -+#endif //USE_SSE4 -+ -+#endif //#ifdef NDEBUG -+ -+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices -+// or for some specific commonly used operations implementation missing in SSE -+#ifdef USE_SSE4 -+ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 -+ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 -+ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 -+ -+ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 -+ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 -+ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 -+ -+ #define _MM_MAX_EPI8 _mm_max_epi8 -+ #define _MM_MAX_EPI32 _mm_max_epi32 -+ #define _MM_MAX_EPU16 _mm_max_epu16 -+ #define _MM_MAX_EPU32 _mm_max_epu32 -+ -+ #define _MM_MIN_EPI8 _mm_min_epi8 -+ #define _MM_MIN_EPI32 _mm_min_epi32 -+ #define _MM_MIN_EPU16 _mm_min_epu16 -+ #define _MM_MIN_EPU32 _mm_min_epu32 -+ -+ #define _MM_BLENDV_EPI8 _mm_blendv_epi8 -+ #define _MM_PACKUS_EPI32 _mm_packus_epi32 -+ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) -+ -+ #define _MM_MULLO_EPI32 _mm_mullo_epi32 -+ #define _MM_MUL_EPI32 _mm_mul_epi32 -+ -+ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64 -+#else //no SSE4 !!!!!! 
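//Overview of the fallbacks below: they rebuild the few SSE4.1 operations used elsewhere
//in this file from SSE2/SSSE3 primitives. The widening conversions are the simplest case:
//zero-extension just interleaves the low eight source bytes with 0x00 bytes
//(_mm_unpacklo_epi8(a, zero)), giving every 16-bit lane a zero high byte, while
//sign-extension interleaves with the byte-wise sign mask that _mm_cmpgt_epi8(zero, a)
//produces (0xff for negative bytes, 0x00 otherwise). The insert/extract fallbacks either
//spill to an aligned buffer or blend through a lane mask, and the min/max fallbacks are
//compare-and-select sequences.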
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi8(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi16(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ return _mm_unpacklo_epi32(a, zero); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi8(zero, a); -+ return _mm_unpacklo_epi8(a, sign); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi16(zero, a); -+ return _mm_unpacklo_epi16(a, sign); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) -+ { -+ __m128i zero = _mm_setzero_si128(); -+ __m128i sign = _mm_cmpgt_epi32(zero, a); -+ return _mm_unpacklo_epi32(a, sign); -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t tmp[4]; -+ _mm_store_si128((__m128i*)tmp, vec); -+ return tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int8_t tmp[16]; -+ _mm_store_si128((__m128i*)tmp, vec); -+ return (int)tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t tmp[4]; -+ _mm_store_si128((__m128i*)tmp, _M128i(vec)); -+ return tmp[LANE]; -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = p; -+ mask[LANE] = 0x0; -+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = (int8_t)p; -+ mask[LANE] = 0x0; -+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) -+ { -+ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; -+ __m128 tmp, vec_masked, p_masked; -+ mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it -+ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p -+ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec -+ tmp = _mm_or_ps(vec_masked, p_masked); -+ return tmp; -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi8 (a, b); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi32(a, b); -+ resa = _mm_and_si128 (cmp, a); -+ resb = 
_mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) -+ { -+ __m128i c8000, b_s, a_s, cmp; -+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff -+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 -+ b_s = _mm_sub_epi16 (b, c8000); -+ a_s = _mm_sub_epi16 (a, c8000); -+ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) -+ { -+ __m128i c80000000, b_s, a_s, cmp; -+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 -+ b_s = _mm_sub_epi32 (b, c80000000); -+ a_s = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi8 (b, a); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) -+ { -+ __m128i cmp, resa, resb; -+ cmp = _mm_cmpgt_epi32(b, a); -+ resa = _mm_and_si128 (cmp, a); -+ resb = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(resa, resb); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) -+ { -+ __m128i c8000, b_s, a_s, cmp; -+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff -+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 -+ b_s = _mm_sub_epi16 (b, c8000); -+ a_s = _mm_sub_epi16 (a, c8000); -+ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) -+ { -+ __m128i c80000000, b_s, a_s, cmp; -+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 -+ b_s = _mm_sub_epi32 (b, c80000000); -+ a_s = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed -+ a_s = _mm_and_si128 (cmp,a); -+ b_s = _mm_andnot_si128 (cmp,b); -+ return _mm_or_si128(a_s, b_s); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below -+ { -+ //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
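    //In other words the result is (mask & b) | (~mask & a), a plain bit-select;
    //_mm_andnot_si128(mask, a) supplies the (~mask) & a half because SSE has no
    //single-instruction bitwise NOT. The genuine SSE4.1 _mm_blendv_epi8 looks only at
    //bit 7 of each mask byte, which is why the two agree only when every mask byte is
    //0x00 or 0xff, as noted above.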
-+ __m128i a_masked, b_masked; -+ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff -+ a_masked = _mm_andnot_si128 (mask,a); -+ return _mm_or_si128(a_masked, b_masked); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) -+ { -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ __m128i a16, b16, res, reshi,cmp, zero; -+ zero = _mm_setzero_si128(); -+ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); -+ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); -+ res = _mm_unpacklo_epi64(a16, b16); //result without saturation -+ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation -+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero -+ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive -+ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) -+ { -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ __m128i a16, res, reshi,cmp, zero; -+ zero = _mm_setzero_si128(); -+ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); -+ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation -+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero -+ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive -+ return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff -+ } -+ -+ -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; -+ int64_t res64; -+ int i; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ for (i = 0; i<4; i++) { -+ res64 = atmp[i] * btmp[i]; -+ res[i] = (int)(res64 & 0xffffffff); -+ } -+ return _mm_load_si128((__m128i*)res); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) -+ { -+ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; -+ sign = _mm_xor_si128 (a, b); -+ sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive -+ zero = _mm_setzero_si128(); -+ a_neg = _mm_abs_epi32 (a); //negate a and b -+ b_neg = _mm_abs_epi32 (b); //negate a and b -+ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result -+ mul_us_neg = _mm_sub_epi64(zero, mul_us); -+ mul_us_neg = _mm_and_si128(sign, mul_us_neg); -+ mul_us = _mm_andnot_si128(sign, mul_us); -+ return _mm_or_si128 (mul_us, mul_us_neg); -+ } -+ -+ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b) -+ { -+ __m128i res; -+ res = _mm_cmpeq_epi32 (a, b); -+ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data -+ } -+#endif //SSE4 -+ -+//the special case of functions working only for 32 bits, no SSE4 -+_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; -+ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff}; -+ __m128i vec_masked, p_masked; -+ pvec[LANE] = p; -+ mask[LANE] = 0x0; -+ 
vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p -+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec -+ return _mm_or_si128(vec_masked, p_masked); -+} -+ -+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE) -+{ -+ _NEON2SSE_ALIGN_16 int64_t tmp[2]; -+ _mm_store_si128((__m128i*)tmp, val); -+ return tmp[LANE]; -+} -+ -+#ifndef _NEON2SSE_64BIT_SSE4 -+ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32 -+ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32 -+#endif -+ -+int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints -+_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) -+{ -+ //Overflow happens only if a and sum have the opposite signs -+ __m128i c7fffffff, res, res_sat, res_xor_a; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_slli_epi32 (a, 1); // res = a*2 -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+ -+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -+//************************************************************************* -+//************************************************************************* -+//***************** Functions redefinition\implementatin starts here ***** -+//************************************************************************* -+//************************************************************************* -+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -+ -+/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample: -+#ifdef ARM -+#define vector_addq_s32 _mm_add_epi32 -+#else //if we have IA -+#define vector_addq_s32 vadd_s32 -+#endif -+ -+******************************************************************************************** -+Functions below are organised in the following way: -+ -+Each NEON intrinsic function has one of the following options: -+1. its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement -+2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement -+3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition -+4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance -+the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path -+- please consider such functions removal from your code. 
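As a concrete illustration of these four options using functions defined below:
vaddq_s8 is option 1 (a plain #define onto _mm_add_epi8), vhaddq_s8 is option 2
(an inline function built from several SSE intrinsics), vadd_u8 is option 3 (it
simply reuses vadd_s8, addition being sign-agnostic), and vqaddq_s64 is option 4
(a serial fallback carrying a _NEON2SSE_PERFORMANCE_WARNING).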
-+*/ -+ -+//*********************************************************************** -+//************************ Vector add ***************************** -+//*********************************************************************** -+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0]; -+ return res64; -+} -+ -+ -+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b) -+{ -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 -+#define vadd_u8 vadd_s8 -+ -+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 -+#define vadd_u16 vadd_s16 -+ -+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 -+#define vadd_u32 vadd_s32 -+ -+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) -+{ -+ uint64x1_t res64; -+ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0]; -+ return res64; -+} -+ -+ -+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 -+#define vaddq_s8 _mm_add_epi8 -+ -+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 -+#define vaddq_s16 _mm_add_epi16 -+ -+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 -+#define vaddq_s32 _mm_add_epi32 -+ -+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 -+#define vaddq_s64 _mm_add_epi64 -+ -+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 -+#define vaddq_f32 _mm_add_ps -+ -+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 -+#define vaddq_u8 _mm_add_epi8 -+ -+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 -+#define vaddq_u16 _mm_add_epi16 -+ -+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 -+#define vaddq_u32 _mm_add_epi32 -+ -+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 -+#define vaddq_u64 _mm_add_epi64 -+ -+//**************************** Vector long add *****************************: -+//*********************************************************************** -+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
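//A minimal sketch added for illustration (not part of the upstream header): every
//vaddl_* below follows the same two-step pattern - widen each 64-bit input into a
//128-bit register, then reuse the ordinary full-width add, which cannot overflow in
//the wider lanes. SSE2-only version of the signed 8-bit case; the helper name is
//hypothetical and the low 64 bits of each argument are assumed to hold the d register:
_NEON2SSE_INLINE __m128i n2s_example_widening_add_s8(__m128i a_low64, __m128i b_low64)
{
    __m128i zero   = _mm_setzero_si128();
    __m128i a_sign = _mm_cmpgt_epi8(zero, a_low64); //0xff for negative bytes, 0x00 otherwise
    __m128i b_sign = _mm_cmpgt_epi8(zero, b_low64);
    __m128i a16    = _mm_unpacklo_epi8(a_low64, a_sign); //sign-extend the low 8 bytes to 16 bits
    __m128i b16    = _mm_unpacklo_epi8(b_low64, b_sign);
    return _mm_add_epi16(a16, b16); //eight 16-bit sums, no overflow possible
}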
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_add_epi16 (a16, b16); -+} -+ -+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a32, b32); -+} -+ -+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 ( a64, b64); -+} -+ -+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi16 (a16, b16); -+} -+ -+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a32, b32); -+} -+ -+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a64, b64); -+} -+ -+//*************** Vector wide add: vaddw_. 
Vr[i]:=Va[i]+Vb[i] ****************** -+//*************** ********************************************************************* -+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 -+_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_add_epi16 (a, b16); -+} -+ -+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 -+_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, -+ return _mm_add_epi32 (a, b32); -+} -+ -+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 -+_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a, b64); -+} -+ -+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 -+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi16 (a, b16); -+} -+ -+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0 -+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi32 (a, b32); -+} -+ -+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 -+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_add_epi64 (a, b64); -+} -+ -+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* -+//************************************************************************************************************************* -+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vhaddq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64( vhaddq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64( vhaddq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64( vhaddq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64( vhaddq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64( vhaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 
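    //Why this works: for any integers x and y, x + y == ((x & y) << 1) + (x ^ y)
    //(the AND collects the carry bits, the XOR is the carry-less sum), so the halved
    //sum is (x & y) + ((x ^ y) >> 1) and never overflows the lane. Worked example with
    //4-bit values: x = 5 (0101), y = 3 (0011): x & y = 0001, x ^ y = 0110,
    //1 + (6 >> 1) = 4 = (5 + 3) >> 1. The xor term needs an arithmetic shift for the
    //signed cases (vshrq_n_s8 here, since SSE has no native 8-bit shift) and a logical
    //one for vhaddq_u32 below.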
-+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = vshrq_n_s8(tmp2,1); -+ return _mm_add_epi8(tmp1,tmp2); -+} -+ -+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srai_epi16(tmp2,1); -+ return _mm_add_epi16(tmp1,tmp2); -+} -+ -+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srai_epi32(tmp2,1); -+ return _mm_add_epi32(tmp1,tmp2); -+} -+ -+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 -+{ -+ __m128i c1, sum, res; -+ c1 = _mm_set1_epi8(1); -+ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it -+ res = _mm_xor_si128(a, b); //for rounding compensation -+ res = _mm_and_si128(res,c1); //for rounding compensation -+ return _mm_sub_epi8 (sum, res); //actual rounding compensation -+} -+ -+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 -+{ -+ __m128i sum, res; -+ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it -+ res = _mm_xor_si128(a, b); //for rounding compensation -+ res = _mm_slli_epi16 (res,15); //shift left then back right to -+ res = _mm_srli_epi16 (res,15); //get 1 or zero -+ return _mm_sub_epi16 (sum, res); //actual rounding compensation -+} -+ -+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 -+{ -+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_and_si128(a,b); -+ tmp2 = _mm_xor_si128(a,b); -+ tmp2 = _mm_srli_epi32(tmp2,1); -+ return _mm_add_epi32(tmp1,tmp2); -+} -+ -+//************************Vector rounding halving add: vrhadd{q}_. Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** -+//***************************************************************************************************************************** -+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 
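    //Note: _mm_avg_epu8 already computes (a + b + 1) >> 1, which is exactly the NEON
    //rounding halving add, so unlike vhaddq_u8 above no rounding compensation is
    //needed for the vrhadd_u8/u16 cases.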
-+} -+ -+ -+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! -+} -+ -+ -+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 -+{ -+ //no signed average in x86 SIMD, go to unsigned -+ __m128i c128, au, bu, sum; -+ c128 = _mm_set1_epi8(0x80); //-128 -+ au = _mm_sub_epi8(a, c128); //add 128 -+ bu = _mm_sub_epi8(b, c128); //add 128 -+ sum = _mm_avg_epu8(au, bu); -+ return _mm_add_epi8 (sum, c128); //sub 128 -+} -+ -+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 -+{ -+ //no signed average in x86 SIMD, go to unsigned -+ __m128i cx8000, au, bu, sum; -+ cx8000 = _mm_set1_epi16(0x8000); // - 32768 -+ au = _mm_sub_epi16(a, cx8000); //add 32768 -+ bu = _mm_sub_epi16(b, cx8000); //add 32768 -+ sum = _mm_avg_epu16(au, bu); -+ return _mm_add_epi16 (sum, cx8000); //sub 32768 -+} -+ -+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) -+{ -+ //need to avoid overflow -+ __m128i a2, b2, res, sum; -+ a2 = _mm_srai_epi32(a,1); //a2=a/2; -+ b2 = _mm_srai_epi32(b,1); // b2=b/2; -+ res = _mm_or_si128(a,b); //for rounding -+ res = _mm_slli_epi32 (res,31); //shift left then back right to -+ res = _mm_srli_epi32 (res,31); //get 1 or zero -+ sum = _mm_add_epi32(a2,b2); -+ return _mm_add_epi32(sum,res); -+} -+ -+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 -+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded -+ -+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 -+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded -+ -+ -+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 -+{ -+ //need to avoid overflow -+ __m128i a2, b2, res, sum; -+ a2 = _mm_srli_epi32(a,1); //a2=a/2; -+ b2 = _mm_srli_epi32(b,1); // b2=b/2; -+ res = _mm_or_si128(a,b); //for rounding -+ res = _mm_slli_epi32 (res,31); //shift left then back right to -+ res = _mm_srli_epi32 (res,31); //get 1 or zero -+ sum = _mm_add_epi32(a2,b2); -+ return _mm_add_epi32(sum,res); -+} -+ -+//****************** VQADD: Vector saturating add ************************ -+//************************************************************************ -+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vqaddq_s32(_pM128i(a), _pM128i(b))); 
-+} -+ -+ -+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int64x1_t res; -+ uint64_t a64, b64; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 + b64; -+ a64 = (a64 >> 63) + (~_SIGNBIT64); -+ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) { -+ res.m64_u64[0] = a64; -+ } -+ return res; -+} -+ -+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vqaddq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t a64, b64; -+ uint64x1_t res; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 + b64; -+ if (res.m64_u64[0] < a64) { -+ res.m64_u64[0] = ~(uint64_t)0; -+ } -+ return res; -+} -+ -+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 -+#define vqaddq_s8 _mm_adds_epi8 -+ -+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 -+#define vqaddq_s16 _mm_adds_epi16 -+ -+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b) -+{ -+ //no corresponding x86 SIMD soulution, special tricks are necessary. 
Overflow happens only if a and b have the same sign and sum has the opposite sign -+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_add_epi32(a, b); -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ b_xor_a_ = _mm_xor_si128(b, a); -+ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] + btmp[0]; -+ res[1] = atmp[1] + btmp[1]; -+ -+ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64); -+ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64); -+ -+ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) { -+ res[0] = atmp[0]; -+ } -+ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) { -+ res[1] = atmp[1]; -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 -+#define vqaddq_u8 _mm_adds_epu8 -+ -+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0 -+#define vqaddq_u16 _mm_adds_epu16 -+ -+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b) -+{ -+ __m128i c80000000, cmp, subsum, suba, sum; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ sum = _mm_add_epi32 (a, b); -+ subsum = _mm_sub_epi32 (sum, c80000000); -+ suba = _mm_sub_epi32 (a, c80000000); -+ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed -+ return _mm_or_si128 (sum, cmp); //saturation -+} -+ -+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b) -+ { -+ __m128i c80000000, sum, cmp, suba, subsum; -+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); -+ sum = _mm_add_epi64 (a, b); -+ subsum = _mm_sub_epi64 (sum, c80000000); -+ suba = _mm_sub_epi64 (a, c80000000); -+ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!! 
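        //The 0x8000000000000000 bias maps unsigned order onto signed order, because
        //SSE4.2's _mm_cmpgt_epi64 only does signed compares. A lane overflowed exactly
        //when the original a is greater (unsigned) than the wrapped sum, so cmp is
        //all-ones for that lane and the OR below saturates it to 0xffffffffffffffff.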
-+ return _mm_or_si128 (sum, cmp); //saturation -+ } -+#else -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] + btmp[0]; -+ res[1] = atmp[1] + btmp[1]; -+ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; -+ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; -+ return _mm_load_si128((__m128i*)(res)); -+ } -+#endif -+ -+ -+//******************* Vector add high half (truncated) ****************** -+//************************************************************************ -+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum; -+ sum = _mm_add_epi16 (a, b); -+ sum = _mm_srai_epi16 (sum, 8); -+ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only -+ return64(sum); -+} -+ -+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0 -+{ -+ int16x4_t res64; -+ __m128i sum; -+ sum = _mm_add_epi32 (a, b); -+ sum = _mm_srai_epi32(sum, 16); -+ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only -+ return64(sum); -+} -+ -+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b) -+{ -+ int32x2_t res64; -+ __m128i sum; -+ sum = _mm_add_epi64 (a, b); -+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sum); -+} -+ -+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum; -+ sum = _mm_add_epi16 (a, b); -+ sum = _mm_srli_epi16 (sum, 8); -+ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only -+ return64(sum); -+} -+ -+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0 -+{ -+ uint16x4_t res64; -+ __m128i sum; -+ sum = _mm_add_epi32 (a, b); -+ sum = _mm_srli_epi32 (sum, 16); -+ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only -+ return64(sum); -+} -+ -+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 -+#define vaddhn_u64 vaddhn_s64 -+ -+//*********** Vector rounding add high half: vraddhn_ ******************. 
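//vraddhn: Vr[i] := (Va[i] + Vb[i] + (1 << (half - 1))) >> half, i.e. the high half of
//the sum rounded to nearest before narrowing (half = 8, 16 or 32 bits per lane).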
-+//*************************************************************************** -+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sum = _mm_srai_epi16 (sum, 8); //get high half -+ sum = _mm_add_epi16 (sum, mask1); //actual rounding -+ sum = _mm_packs_epi16 (sum, sum); -+ return64(sum); -+} -+ -+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ int16x4_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sum = _mm_srai_epi32 (sum, 16); //get high half -+ sum = _mm_add_epi32 (sum, mask1); //actual rounding -+ sum = _mm_packs_epi32 (sum, sum); -+ return64(sum); -+} -+ -+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ int32x2_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi64 (a, b); -+ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero -+ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding -+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6)); -+ return64(sum); -+} -+ -+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sum = _mm_srai_epi16 (sum, 8); //get high half -+ sum = _mm_add_epi16 (sum, mask1); //actual rounding -+ sum = _mm_packus_epi16 (sum, sum); -+ return64(sum); -+} -+ -+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ uint16x4_t res64; -+ __m128i sum, mask1; -+ sum = _mm_add_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sum = _mm_srai_epi32 (sum, 16); //get high half -+ sum = _mm_add_epi32 (sum, mask1); //actual rounding -+ sum = _MM_PACKUS1_EPI32 (sum); -+ return64(sum); -+} -+ -+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 -+#define vraddhn_u64 vraddhn_s64 -+ -+//********************************************************************************** -+//********* Multiplication ************************************* -+//************************************************************************************** -+ -+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] -+//As we don't go to wider result functions are equal to "multiply low" in x86 -+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0 -+{ -+ // no 8 bit 
simd multiply, need to go to 16 bits in SSE -+ int8x8_t res64; -+ __m128i a128, b128, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits -+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (a128, b128); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only -+ return64(res); -+} -+ -+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 -+#define vmul_s16 vmul_u16 -+ -+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 -+#define vmul_s32 vmul_u32 -+ -+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t tmp; -+ __m64_128 res64; -+ tmp = _mm_mul_ps(_pM128(a),_pM128(b)); -+ _M64f(res64, tmp); //use low 64 bits -+ return res64; -+} -+ -+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits in SSE -+ uint8x8_t res64; -+ __m128i mask, a128, b128, res; -+ mask = _mm_set1_epi16(0xff); -+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); -+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); -+ res = _mm_mullo_epi16 (a128, b128); -+ res = _mm_and_si128(res, mask); //to avoid saturation -+ res = _mm_packus_epi16 (res,res); //use only low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0]; -+ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1]; -+ return res; -+} -+ -+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 -+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b) -+{ -+ //may be optimized -+ poly8x8_t res64; -+ __m128i a64, b64, c1, res, tmp, bmasked; -+ int i; -+ a64 = _pM128i(a); -+ b64 = _pM128i(b); -+ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff.... 
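    //vmul_p8 is a carry-less (GF(2) polynomial) multiply, so partial products are
    //combined with XOR rather than ADD. The next line reduces c1 to 0x01 in every byte,
    //and each loop pass shifts it left by one, so every bmasked byte is either 0 or
    //1 << k; the ordinary byte multiply by that value is the k-bit-shifted partial
    //product, and XOR-ing all eight partial products yields the polynomial product
    //truncated to 8 bits, as NEON defines it.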
-+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b64, c1); //0x1 -+ res = vmulq_u8(a64, bmasked); -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b64, c1); //0x1 -+ tmp = vmulq_u8(a64, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return64 (res); -+} -+ -+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits -+ //solution may be not optimal -+ __m128i a16, b16, r16_1, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (a16, b16); -+ //swap hi and low part of a and b to process the remaining data -+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 -+ -+ r16_2 = _mm_mullo_epi16 (a16, b16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit -+ -+ return _mm_unpacklo_epi64(r16_1, r16_2); -+} -+ -+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 -+#define vmulq_s16 _mm_mullo_epi16 -+ -+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 -+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 -+ -+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 -+#define vmulq_f32 _mm_mul_ps -+ -+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits -+ //solution may be not optimal -+ __m128i maskff, a16, b16, r16_1, r16_2; -+ maskff = _mm_set1_epi16(0xff); -+ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (a16, b16); -+ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation -+ //swap hi and low part of a and b to process the remaining data -+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (a16, b16); -+ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation -+ return _mm_packus_epi16 (r16_1, r16_2); -+} -+ -+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 -+#define vmulq_u16 _mm_mullo_epi16 -+ -+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 -+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 -+ -+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 -+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) -+{ -+ //may be optimized -+ __m128i c1, res, tmp, bmasked; -+ int i; -+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 
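    //Same carry-less (GF(2)) bit-by-bit accumulation as vmul_p8 above, applied to all
    //sixteen byte lanes of the full 128-bit registers.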
-+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b, c1); //0x1 -+ res = vmulq_u8(a, bmasked); -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b, c1); //0x1 -+ tmp = vmulq_u8(a, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return res; -+} -+ -+//************************* Vector long multiply *********************************** -+//**************************************************************************** -+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0 -+{ -+ //no 8 bit simd multiply, need to go to 16 bits -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 -+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit -+} -+ -+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0 -+{ -+ #ifdef USE_SSE4 -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 -+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 -+ #else -+ __m128i low, hi, a128,b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ low = _mm_mullo_epi16(a128,b128); -+ hi = _mm_mulhi_epi16(a128,b128); -+ return _mm_unpacklo_epi16(low,hi); -+ #endif -+} -+ -+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0 -+{ -+ __m128i ab, ba, a128, b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 -+ return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+} -+ -+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0 -+{ -+ //no 8 bit simd multiply, need to go to 16 bits -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 -+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit -+} -+ -+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0 -+{ -+ #ifdef USE_SSE4 -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 -+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 -+ #else -+ __m128i a128,b128,low, hi; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ low = _mm_mullo_epi16(a128,b128); -+ hi = _mm_mulhi_epu16(a128,b128); -+ return _mm_unpacklo_epi16(low,hi); -+ #endif -+} -+ -+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0 -+{ -+ ///may be not optimal compared with serial implementation -+ __m128i ab, ba, a128, b128; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 -+ return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+} -+ -+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0 -+_NEON2SSE_INLINE poly16x8_t 
vmull_p8(poly8x8_t a, poly8x8_t b) -+{ -+ //may be optimized -+ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked; -+ int i; -+ a128 = _pM128i(a); -+ b128 = _pM128i(b); -+ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff.... -+ c1 = vshrq_n_u8(c1,7); //0x1 -+ bmasked = _mm_and_si128(b128, c1); //0x1 -+ -+ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1 -+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 -+ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit -+ for(i = 1; i<8; i++) { -+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here -+ bmasked = _mm_and_si128(b128, c1); //0x1 -+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1 -+ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked); -+ res = _mm_xor_si128(res, tmp); -+ } -+ return res; -+} -+ -+//****************Vector saturating doubling long multiply ************************** -+//***************************************************************** -+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b) -+{ -+ //the serial soulution may be faster due to saturation -+ __m128i res; -+ res = vmull_s16(a, b); -+ return vqd_s32(res); -+} -+ -+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //the serial soulution may be faster due to saturation -+ __m128i res; -+ res = vmull_s32(a,b); -+ return vqaddq_s64(res,res); //slow serial function!!!! -+} -+ -+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************ -+//****************************************************************************************** -+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0 -+{ -+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits -+ int8x8_t res64; -+ __m128i b128, c128, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (c128, b128); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); -+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+ -+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0 -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1 -+ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ //fma is coming soon, but right now: -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_mul_ps (_pM128(c), _pM128(b)); -+ res = _mm_add_ps 
(_pM128(a), res); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0 -+{ -+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits -+ uint8x8_t res64; -+ __m128i mask, b128, c128, res; -+ mask = _mm_set1_epi16(0xff); -+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits -+ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits -+ res = _mm_mullo_epi16 (c128, b128); -+ res = _mm_and_si128(res, mask); //to avoid saturation -+ res = _mm_packus_epi16 (res, res); -+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 -+#define vmla_u16 vmla_s16 -+ -+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 -+#define vmla_u32 vmla_s32 -+ -+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2,r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_add_epi8 (r16_1, a); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_add_epi8(r16_2, a_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_mullo_epi16 (c, b); -+ return _mm_add_epi16 (res, a); -+} -+ -+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 -+ return _mm_add_epi32 (res, a); -+} -+ -+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 -+{ -+ //fma is coming soon, but right now: -+ __m128 res; -+ res = _mm_mul_ps (c, b); -+ return _mm_add_ps (a, res); -+} -+ -+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 
(c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_add_epi8 (r16_1, a); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_add_epi8(r16_2, a_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 -+#define vmlaq_u16 vmlaq_s16 -+ -+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 -+#define vmlaq_u32 vmlaq_s32 -+ -+//********************** Vector widening multiply accumulate (long multiply accumulate): -+// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** -+//******************************************************************************************** -+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0 -+{ -+ int16x8_t res; -+ res = vmull_s8(b, c); -+ return _mm_add_epi16 (res, a); -+} -+ -+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int32x4_t res; -+ res = vmull_s16(b, c); -+ return _mm_add_epi32 (res, a); -+} -+ -+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_s32( b, c); -+ return _mm_add_epi64 (res, a); -+} -+ -+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0 -+{ -+ uint16x8_t res; -+ res = vmull_u8(b, c); -+ return _mm_add_epi16 (res, a); -+} -+ -+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ uint32x4_t res; -+ res = vmull_u16(b, c); -+ return _mm_add_epi32 (res, a); -+} -+ -+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_u32( b,c); -+ return _mm_add_epi64 (res, a); -+} -+ -+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** -+//******************************************************************************************** -+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0 -+{ -+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits -+ int8x8_t res64; -+ 
__m128i res; -+ res64 = vmul_s8(b,c); -+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); -+ return64(res); -+} -+ -+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+ -+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0 -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 -+ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only -+ return64(res); -+} -+ -+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ __m128 res; -+ __m64_128 res64; -+ res = _mm_mul_ps (_pM128(c), _pM128(b)); -+ res = _mm_sub_ps (_pM128(a), res); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits -+ uint8x8_t res64; -+ __m128i res; -+ res64 = vmul_u8(b,c); -+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); -+ return64(res); -+} -+ -+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 -+#define vmls_u16 vmls_s16 -+ -+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 -+#define vmls_u32 vmls_s32 -+ -+ -+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); -+ r16_1 = _mm_sub_epi8 (a, r16_1); -+ //swap hi and low part of a, b, c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_sub_epi8 (a_2, r16_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_mullo_epi16 (c, b); -+ return _mm_sub_epi16 (a, res); -+} -+ -+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1 -+ return _mm_sub_epi32 (a, res); -+} -+ -+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, 
float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 -+{ -+ __m128 res; -+ res = _mm_mul_ps (c, b); -+ return _mm_sub_ps (a, res); -+} -+ -+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 -+{ -+ //solution may be not optimal -+ // no 8 bit simd multiply, need to go to 16 bits -+ __m128i b16, c16, r16_1, a_2, r16_2; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 -+ r16_1 = _mm_mullo_epi16 (b16, c16); -+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits -+ r16_1 = _mm_sub_epi8 (a, r16_1); -+ //swap hi and low part of a, b and c to process the remaining data -+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); -+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); -+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 -+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 -+ -+ r16_2 = _mm_mullo_epi16 (b16, c16); -+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); -+ r16_2 = _mm_sub_epi8(a_2, r16_2); -+ return _mm_unpacklo_epi64(r16_1,r16_2); -+} -+ -+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 -+#define vmlsq_u16 vmlsq_s16 -+ -+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 -+#define vmlsq_u32 vmlsq_s32 -+ -+//******************** Vector multiply subtract long (widening multiply subtract) ************************************ -+//************************************************************************************************************* -+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0 -+{ -+ int16x8_t res; -+ res = vmull_s8(b, c); -+ return _mm_sub_epi16 (a, res); -+} -+ -+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int32x4_t res; -+ res = vmull_s16(b, c); -+ return _mm_sub_epi32 (a, res); -+} -+ -+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ int64x2_t res; -+ res = vmull_s32( b,c); -+ return _mm_sub_epi64 (a, res); -+} -+ -+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0 -+{ -+ uint16x8_t res; -+ res = vmull_u8(b, c); -+ return _mm_sub_epi16 (a, res); -+} -+ -+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0 -+{ -+ //may be not optimal compared with serial implementation -+ uint32x4_t res; -+ res = vmull_u16(b, c); -+ return _mm_sub_epi32 (a, res); -+} -+ -+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0 -+{ -+ //may be not optimal 
compared with serial implementation -+ int64x2_t res; -+ res = vmull_u32( b,c); -+ return _mm_sub_epi64 (a, res); -+} -+ -+//****** Vector saturating doubling multiply high ********************** -+//************************************************************************* -+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int16x4_t res; -+ int32_t a32, b32, i; -+ for (i = 0; i<4; i++) { -+ a32 = (int32_t) a.m64_i16[i]; -+ b32 = (int32_t) b.m64_i16[i]; -+ a32 = (a32 * b32) >> 15; -+ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32; -+ } -+ return res; -+} -+ -+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster -+{ -+ //may be not optimal compared with a serial solution -+ int32x2_t res64; -+ __m128i mask; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ int64x2_t mul; -+ mul = vmull_s32(a,b); -+ mul = _mm_slli_epi64(mul,1); //double the result -+ //at this point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+ return64(mul); -+} -+ -+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 -+{ -+ __m128i res, res_lo, mask; -+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; -+ res = _mm_mulhi_epi16 (a, b); -+ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation -+ res_lo = _mm_mullo_epi16 (a, b); -+ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit -+ res = _mm_add_epi16(res, res_lo); //combine results -+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 -+} -+ -+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target -+ __m128i ab, ba, mask, mul, mul1; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 -+ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul = _mm_slli_epi64(mul,1); //double the result -+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 -+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 -+ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul1 = _mm_slli_epi64(mul1,1); //double the result -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits -+ mul = _mm_unpacklo_epi64(mul, mul1); -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+} -+ -+//********* Vector saturating rounding doubling multiply high **************** -+//**************************************************************************** -+//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order -+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //may be not optimal compared with a serial solution -+ int32x2_t res64; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ __m128i res_sat, mask, mask1; -+ int64x2_t mul; -+ mul = vmull_s32(a,b); -+ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding -+ //at this point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+ return64(mul); -+} -+ -+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 -+{ -+ __m128i mask, res; -+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; -+ res = _mm_mulhrs_epi16 (a, b); -+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000 -+} -+ -+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target -+ __m128i ab, ba, mask, mul, mul1, mask1; -+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 -+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 -+ mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul = _mm_add_epi32 (mul, mask1); //actual rounding -+ -+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 -+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 -+ mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result -+ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered -+ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero -+ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding -+ //at this 
point start treating 2 64-bit numbers as 4 32-bit -+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit -+ mul = _mm_unpacklo_epi64(mul, mul1); -+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); -+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 -+} -+ -+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** -+//************************************************************************************************************************* -+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0 -+{ -+ //not optimal SIMD soulution, serial may be faster -+ __m128i res32; -+ res32 = vmull_s16(b, c); -+ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1); -+ return vqaddq_s32(res32, a); //saturation -+} -+ -+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res64; -+ res64 = vmull_s32(b,c); -+ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1); -+ return vqaddq_s64(res64, a); //saturation -+} -+ -+//************************************************************************************ -+//****************** Vector subtract *********************************************** -+//************************************************************************************ -+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0]; -+ return res64; -+} -+ -+ -+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0]; -+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1]; -+ return res; -+} -+ -+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 -+#define vsub_u8 vsub_s8 -+ -+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 -+#define vsub_u16 vsub_s16 -+ -+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 -+#define vsub_u32 vsub_s32 -+ -+ -+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b) -+{ -+ int64x1_t res64; -+ res64.m64_u64[0] = a.m64_u64[0] 
- b.m64_u64[0]; -+ return res64; -+} -+ -+ -+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 -+#define vsubq_s8 _mm_sub_epi8 -+ -+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 -+#define vsubq_s16 _mm_sub_epi16 -+ -+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 -+#define vsubq_s32 _mm_sub_epi32 -+ -+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 -+#define vsubq_s64 _mm_sub_epi64 -+ -+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 -+#define vsubq_f32 _mm_sub_ps -+ -+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 -+#define vsubq_u8 _mm_sub_epi8 -+ -+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 -+#define vsubq_u16 _mm_sub_epi16 -+ -+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 -+#define vsubq_u32 _mm_sub_epi32 -+ -+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 -+#define vsubq_u64 _mm_sub_epi64 -+ -+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** -+//*********************************************************************************** -+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. -+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a16, b16); -+} -+ -+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a32, b32); -+} -+ -+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 -+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi64 (a64, b64); -+} -+ -+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a16, b16); -+} -+ -+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a32, b32); -+} -+ -+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 -+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0 -+{ -+ //may be not optimal -+ __m128i a64, b64; -+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi64 (a64, b64); -+} -+ -+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** -+//***************************************************************************************************** -+int16x8_t vsubw_s8(int16x8_t a, 
int8x8_t b); // VSUBW.S8 q0,q0,d0 -+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a, b16); -+} -+ -+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 -+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a, b32); -+} -+ -+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 -+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_sub_epi64 (a, b64); -+} -+ -+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 -+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0 -+{ -+ __m128i b16; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi16 (a, b16); -+} -+ -+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0 -+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0 -+{ -+ __m128i b32; -+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, -+ return _mm_sub_epi32 (a, b32); -+} -+ -+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 -+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0 -+{ -+ __m128i b64; -+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 -+ return _mm_sub_epi64 (a, b64); -+} -+ -+//************************Vector saturating subtract ********************************* -+//************************************************************************************* -+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vqsubq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution -+{ -+ uint64x1_t res; -+ uint64_t a64,b64; -+ a64 = a.m64_u64[0]; -+ b64 = b.m64_u64[0]; -+ res.m64_u64[0] = a64 - b64; -+ -+ a64 = (a64 >> 63) + (~_SIGNBIT64); -+ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) { -+ res.m64_u64[0] = a64; -+ } -+ return res; -+} -+ -+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ 
return64(vqsubq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint64x1_t res; -+ uint64_t a64, b64; -+ a64 = _Ui64(a); -+ b64 = _Ui64(b); -+ if (a64 > b64) { -+ res.m64_u64[0] = a64 - b64; -+ } else { -+ res.m64_u64[0] = 0; -+ } -+ return res; -+} -+ -+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 -+#define vqsubq_s8 _mm_subs_epi8 -+ -+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 -+#define vqsubq_s16 _mm_subs_epi16 -+ -+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b) -+{ -+ //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a -+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a; -+ c7fffffff = _mm_set1_epi32(0x7fffffff); -+ res = _mm_sub_epi32(a, b); -+ res_sat = _mm_srli_epi32(a, 31); -+ res_sat = _mm_add_epi32(res_sat, c7fffffff); -+ res_xor_a = _mm_xor_si128(res, a); -+ b_xor_a = _mm_xor_si128(b, a); -+ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a); -+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise -+ res_sat = _mm_and_si128(res_xor_a, res_sat); -+ res = _mm_andnot_si128(res_xor_a, res); -+ return _mm_or_si128(res, res_sat); -+} -+ -+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = atmp[0] - btmp[0]; -+ res[1] = atmp[1] - btmp[1]; -+ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) { -+ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64; -+ } -+ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) { -+ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64; -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 -+#define vqsubq_u8 _mm_subs_epu8 -+ -+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0 -+#define vqsubq_u16 _mm_subs_epu16 -+ -+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0 -+{ -+ __m128i min, mask, sub; -+ min = _MM_MIN_EPU32(a, b); //SSE4.1 -+ mask = _mm_cmpeq_epi32 (min, b); -+ sub = _mm_sub_epi32 (a, b); -+ return _mm_and_si128 ( sub, mask); -+} -+ -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b) -+ { -+ __m128i c80000000, subb, suba, cmp, sub; -+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); -+ sub = _mm_sub_epi64 (a, b); -+ suba = _mm_sub_epi64 (a, c80000000); -+ subb = _mm_sub_epi64 (b, c80000000); -+ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!! 
-+ return _mm_and_si128 (sub, cmp); //saturation -+ } -+#else -+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+ { -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ _mm_store_si128((__m128i*)btmp, b); -+ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; -+ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0; -+ return _mm_load_si128((__m128i*)(res)); -+ } -+#endif -+ -+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** -+//**************************************************************** -+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0 -+{ -+ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit, -+ int8x8_t res64; -+ __m128i r16; -+ int8x8_t r; -+ r = vsub_s8 (a, b); -+ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 -+ r16 = _mm_srai_epi16 (r16, 1); //SSE2 -+ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits -+ return64(r16); -+} -+ -+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vhsubq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+ -+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vhsubq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vhsubq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vhsubq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vhsubq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 -+{ -+ // //need to deal with the possibility of internal overflow -+ __m128i c128, au,bu; -+ c128 = _mm_set1_epi8 (128); -+ au = _mm_add_epi8( a, c128); -+ bu = _mm_add_epi8( b, c128); -+ return vhsubq_u8(au,bu); -+} -+ -+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i c8000, au,bu; -+ c8000 = _mm_set1_epi16(0x8000); -+ au = _mm_add_epi16( a, c8000); -+ bu = _mm_add_epi16( b, c8000); -+ return vhsubq_u16(au,bu); -+} -+ -+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i a2, b2,r, b_1; -+ a2 = _mm_srai_epi32 (a,1); -+ b2 = _mm_srai_epi32 (b,1); -+ r = _mm_sub_epi32 (a2, b2); -+ b_1 = _mm_andnot_si128(a, b); //!a and b -+ b_1 = _mm_slli_epi32 (b_1,31); -+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit -+ return _mm_sub_epi32(r,b_1); -+} -+ 
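The halving-subtract emulations here lean on two identities, since SSE has no direct VHSUB counterpart: for the 32-bit lanes above, (a - b) >> 1 equals (a >> 1) - (b >> 1) minus a borrow bit ((~a & b) & 1), and for the unsigned 8-/16-bit lanes below, it equals a - avg(a, b) because _mm_avg_epu8/_mm_avg_epu16 round upwards. A minimal scalar sketch of both identities follows; the helper names are illustrative only and not part of the NEON2SSE code:

#include <assert.h>
#include <stdint.h>

/* reference: NEON halving subtract, (a - b) >> 1 computed at full precision */
static int32_t hsub_ref(int32_t a, int32_t b)
{
    return (int32_t)(((int64_t)a - (int64_t)b) >> 1);
}

/* trick used by vhsubq_s32/vhsubq_u32: shift each operand first, then repay the
   borrow lost when the low bits of a and b are discarded; assumes arithmetic >>
   on negative ints, as the SSE code does via _mm_srai_epi32 */
static int32_t hsub_shift(int32_t a, int32_t b)
{
    return (a >> 1) - (b >> 1) - ((~a & b) & 1);
}

/* trick used by vhsubq_u8/vhsubq_u16: the average rounds up, so a - avg(a, b)
   truncates exactly like (a - b) >> 1 in modular 8-bit arithmetic */
static uint8_t hsub_u8_avg(uint8_t a, uint8_t b)
{
    uint8_t avg = (uint8_t)(((unsigned)a + b + 1) >> 1);
    return (uint8_t)(a - avg);
}

int main(void)
{
    assert(hsub_shift(-7, 4) == hsub_ref(-7, 4));           /* both -6 */
    assert(hsub_u8_avg(3, 10) == (uint8_t)((3 - 10) >> 1)); /* both 252 */
    return 0;
}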
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 -+{ -+ __m128i avg; -+ avg = _mm_avg_epu8 (a, b); -+ return _mm_sub_epi8(a, avg); -+} -+ -+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 -+{ -+ __m128i avg; -+ avg = _mm_avg_epu16 (a, b); -+ return _mm_sub_epi16(a, avg); -+} -+ -+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 -+{ -+ //need to deal with the possibility of internal overflow -+ __m128i a2, b2,r, b_1; -+ a2 = _mm_srli_epi32 (a,1); -+ b2 = _mm_srli_epi32 (b,1); -+ r = _mm_sub_epi32 (a2, b2); -+ b_1 = _mm_andnot_si128(a, b); //!a and b -+ b_1 = _mm_slli_epi32 (b_1,31); -+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit -+ return _mm_sub_epi32(r,b_1); -+} -+ -+//******* Vector subtract high half (truncated) ** ************ -+//************************************************************ -+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sum, sum8; -+ sum = _mm_sub_epi16 (a, b); -+ sum8 = _mm_srai_epi16 (sum, 8); -+ sum8 = _mm_packs_epi16(sum8,sum8); -+ return64(sum8); -+} -+ -+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0 -+{ -+ int16x4_t res64; -+ __m128i sum, sum16; -+ sum = _mm_sub_epi32 (a, b); -+ sum16 = _mm_srai_epi32 (sum, 16); -+ sum16 = _mm_packs_epi32(sum16,sum16); -+ return64(sum16); -+} -+ -+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b) -+{ -+ int32x2_t res64; -+ __m128i sub; -+ sub = _mm_sub_epi64 (a, b); -+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sub); -+} -+ -+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sum, sum8; -+ sum = _mm_sub_epi16 (a, b); -+ sum8 = _mm_srli_epi16 (sum, 8); -+ sum8 = _mm_packus_epi16(sum8,sum8); -+ return64(sum8); -+} -+ -+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0 -+{ -+ uint16x4_t res64; -+ __m128i sum, sum16; -+ sum = _mm_sub_epi32 (a, b); -+ sum16 = _mm_srli_epi32 (sum, 16); -+ sum16 = _MM_PACKUS1_EPI32(sum16); -+ return64(sum16); -+} -+ -+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 -+#define vsubhn_u64 vsubhn_s64 -+ -+//************ Vector rounding subtract high half ********************* -+//********************************************************************* -+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0 -+{ -+ int8x8_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sub = _mm_srai_epi16 (sub, 8); //get high half -+ sub = _mm_add_epi16 (sub, mask1); 
//actual rounding -+ sub = _mm_packs_epi16 (sub, sub); -+ return64(sub); -+} -+ -+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ int16x4_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sub = _mm_srai_epi32 (sub, 16); //get high half -+ sub = _mm_add_epi32 (sub, mask1); //actual rounding -+ sub = _mm_packs_epi32 (sub, sub); -+ return64(sub); -+} -+ -+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b) -+{ -+ //SIMD may be not optimal, serial may be faster -+ int32x2_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi64 (a, b); -+ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to -+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero -+ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding -+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); -+ return64(sub); -+} -+ -+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 -+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0 -+{ -+ uint8x8_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi16 (a, b); -+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to -+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero -+ sub = _mm_srai_epi16 (sub, 8); //get high half -+ sub = _mm_add_epi16 (sub, mask1); //actual rounding -+ sub = _mm_packus_epi16 (sub, sub); -+ return64(sub); -+} -+ -+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 -+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0 -+{ -+ //SIMD may be not optimal, serial may be faster -+ uint16x4_t res64; -+ __m128i sub, mask1; -+ sub = _mm_sub_epi32 (a, b); -+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to -+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero -+ sub = _mm_srai_epi32 (sub, 16); //get high half -+ sub = _mm_add_epi32 (sub, mask1); //actual rounding -+ sub = _MM_PACKUS1_EPI32 (sub); -+ return64(sub); -+} -+ -+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 -+#define vrsubhn_u64 vrsubhn_s64 -+ -+//*********** Vector saturating doubling multiply subtract long ******************** -+//************************************************************************************ -+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) -+{ -+ //not optimal SIMD soulution, serial may be faster -+ __m128i res32, mask; -+ int32x4_t res; -+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ res = vmull_s16(b, c); -+ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered -+ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask); -+ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000 -+ return vqsubq_s32(a, res32); //saturation -+} -+ -+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ 
__m128i res64, mask; -+ int64x2_t res; -+ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000}; -+ res = vmull_s32(b, c); -+ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered -+ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask); -+ res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000 -+ return vqsubq_s64(a, res64); //saturation -+} -+ -+//****************** COMPARISON *************************************** -+//******************* Vector compare equal ************************************* -+//**************************************************************************** -+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) ); -+ return64f(res); -+} -+ -+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 -+#define vceq_p8 vceq_u8 -+ -+ -+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_s8 _mm_cmpeq_epi8 -+ -+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 -+#define vceqq_s16 _mm_cmpeq_epi16 -+ -+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 -+#define vceqq_s32 _mm_cmpeq_epi32 -+ -+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpeq_ps(a,b); -+ return _M128i(res); -+} -+ -+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_u8 _mm_cmpeq_epi8 -+ -+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 -+#define vceqq_u16 _mm_cmpeq_epi16 -+ -+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 -+#define vceqq_u32 _mm_cmpeq_epi32 -+ -+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 -+#define vceqq_p8 _mm_cmpeq_epi8 -+ -+//******************Vector compare greater-than or equal************************* -+//******************************************************************************* -+//in IA SIMD no greater-than-or-equal 
comparison for integers, -+// there is greater-than available only, so we need the following tricks -+ -+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vcgeq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vcgeq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vcgeq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries -+ return64f(res); -+} -+ -+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vcgeq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vcgeq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b) -+{ -+ //serial solution looks faster -+ uint32x2_t res64; -+ return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); -+} -+ -+ -+ -+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi8 ( a, b); -+ m2 = _mm_cmpeq_epi8 ( a, b); -+ return _mm_or_si128 ( m1, m2); -+} -+ -+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi16 ( a, b); -+ m2 = _mm_cmpeq_epi16 ( a, b); -+ return _mm_or_si128 ( m1,m2); -+} -+ -+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 -+{ -+ __m128i m1, m2; -+ m1 = _mm_cmpgt_epi32 (a, b); -+ m2 = _mm_cmpeq_epi32 (a, b); -+ return _mm_or_si128 (m1, m2); -+} -+ -+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpge_ps(a,b); //use only 2 first entries -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 -+{ -+ //no unsigned chars comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu8(a, b); -+ return _mm_cmpeq_epi8(cmp, a); //a>=b -+ #else -+ __m128i c128, as, bs, m1, m2; -+ c128 = _mm_set1_epi8 (128); -+ as = _mm_sub_epi8( a, c128); -+ bs = _mm_sub_epi8( b, c128); -+ m1 = _mm_cmpgt_epi8( as, bs); -+ m2 = _mm_cmpeq_epi8 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t 
vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 -+{ -+ //no unsigned shorts comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu16(a, b); -+ return _mm_cmpeq_epi16(cmp, a); //a>=b -+ #else -+ __m128i c8000, as, bs, m1, m2; -+ c8000 = _mm_set1_epi16 (0x8000); -+ as = _mm_sub_epi16(a,c8000); -+ bs = _mm_sub_epi16(b,c8000); -+ m1 = _mm_cmpgt_epi16(as, bs); -+ m2 = _mm_cmpeq_epi16 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 -+{ -+ //no unsigned ints comparison, only signed available,so need the trick -+ #ifdef USE_SSE4 -+ __m128i cmp; -+ cmp = _mm_max_epu32(a, b); -+ return _mm_cmpeq_epi32(cmp, a); //a>=b -+ #else -+ //serial solution may be faster -+ __m128i c80000000, as, bs, m1, m2; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ as = _mm_sub_epi32(a,c80000000); -+ bs = _mm_sub_epi32(b,c80000000); -+ m1 = _mm_cmpgt_epi32 (as, bs); -+ m2 = _mm_cmpeq_epi32 (as, bs); -+ return _mm_or_si128 ( m1, m2); -+ #endif -+} -+ -+//**********************Vector compare less-than or equal****************************** -+//*************************************************************************************** -+//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks -+ -+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vcleq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vcleq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vcleq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0? -+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmple_ps(_pM128(a),_pM128(b)); -+ return64f(res); -+} -+ -+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 -+#define vcle_u8(a,b) vcge_u8(b,a) -+ -+ -+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 -+#define vcle_u16(a,b) vcge_u16(b,a) -+ -+ -+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 -+#define vcle_u32(a,b) vcge_u32(b,a) -+ -+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... -+ res = _mm_cmpgt_epi8 ( a, b); -+ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal -+} -+ -+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 
-+ res = _mm_cmpgt_epi16 ( a, b); -+ return _mm_andnot_si128 (res, c1); -+} -+ -+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 -+{ -+ __m128i c1, res; -+ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... -+ res = _mm_cmpgt_epi32 ( a, b); -+ return _mm_andnot_si128 (res, c1); -+} -+ -+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmple_ps(a,b); -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 -+ { -+ //no unsigned chars comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu8(a, b); -+ return _mm_cmpeq_epi8(cmp, a); //a<=b -+ } -+#else -+ #define vcleq_u8(a,b) vcgeq_u8(b,a) -+#endif -+ -+ -+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 -+ { -+ //no unsigned shorts comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu16(a, b); -+ return _mm_cmpeq_epi16(cmp, a); //a<=b -+ } -+#else -+ #define vcleq_u16(a,b) vcgeq_u16(b,a) -+#endif -+ -+ -+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 -+#ifdef USE_SSE4 -+ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 -+ { -+ //no unsigned chars comparison in SSE, only signed available,so need the trick -+ __m128i cmp; -+ cmp = _mm_min_epu32(a, b); -+ return _mm_cmpeq_epi32(cmp, a); //a<=b -+ } -+#else -+//solution may be not optimal compared with the serial one -+ #define vcleq_u32(a,b) vcgeq_u32(b,a) -+#endif -+ -+ -+//****** Vector compare greater-than ****************************************** -+//************************************************************************** -+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128 res; -+ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries -+ return64f(res); -+} -+ -+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vcgtq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vcgtq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); 
// VCGT.U32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vcgtq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+#define vcgtq_s8 _mm_cmpgt_epi8 -+ -+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+#define vcgtq_s16 _mm_cmpgt_epi16 -+ -+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+#define vcgtq_s32 _mm_cmpgt_epi32 -+ -+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) -+{ -+ __m128 res; -+ res = _mm_cmpgt_ps(a,b); //use only 2 first entries -+ return *(__m128i*)&res; -+} -+ -+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 -+{ -+ //no unsigned chars comparison, only signed available,so need the trick -+ __m128i c128, as, bs; -+ c128 = _mm_set1_epi8 (128); -+ as = _mm_sub_epi8(a,c128); -+ bs = _mm_sub_epi8(b,c128); -+ return _mm_cmpgt_epi8 (as, bs); -+} -+ -+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 -+{ -+ //no unsigned short comparison, only signed available,so need the trick -+ __m128i c8000, as, bs; -+ c8000 = _mm_set1_epi16 (0x8000); -+ as = _mm_sub_epi16(a,c8000); -+ bs = _mm_sub_epi16(b,c8000); -+ return _mm_cmpgt_epi16 ( as, bs); -+} -+ -+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0 -+{ -+ //no unsigned int comparison, only signed available,so need the trick -+ __m128i c80000000, as, bs; -+ c80000000 = _mm_set1_epi32 (0x80000000); -+ as = _mm_sub_epi32(a,c80000000); -+ bs = _mm_sub_epi32(b,c80000000); -+ return _mm_cmpgt_epi32 ( as, bs); -+} -+ -+//********************* Vector compare less-than ************************** -+//************************************************************************* -+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 -+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!! -+ -+ -+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 -+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!! -+ -+ -+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 -+#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!! -+ -+ -+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 -+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!! -+ -+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 -+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!! -+ -+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 -+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!! -+ -+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 -+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!! -+ -+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 -+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! -+ -+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 -+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! -+ -+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 -+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 
-+ -+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 -+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! -+ -+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 -+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! -+ -+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 -+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! -+ -+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 -+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! -+ -+//*****************Vector compare absolute greater-than or equal ************ -+//*************************************************************************** -+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmpge_ps ( a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmpge_ps ( a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//********Vector compare absolute less-than or equal ****************** -+//******************************************************************** -+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmple_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmple_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//******** Vector compare absolute greater-than ****************** -+//****************************************************************** -+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmpgt_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmpgt_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ 
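The vcage/vcagt family above obtains |a| and |b| by ANDing each float with 0x7fffffff, which clears the IEEE-754 sign bit before the ordinary compare. A small standalone sketch of that absolute-value step (illustrative only; abs_ps is a made-up name, and it uses the _mm_castsi128_ps intrinsic instead of the pointer cast used in the header):

/* Illustrative sketch (not part of the patch): absolute value of four floats
 * by clearing the sign bit, the same masking step used by vcageq_f32 and
 * vcagtq_f32 above. Requires SSE2 only. */
#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

static __m128 abs_ps(__m128 x)
{
    /* 0x7fffffff keeps mantissa and exponent, clears the IEEE-754 sign bit. */
    return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
}

int main(void)
{
    float in[4] = { -1.5f, 2.25f, -0.0f, -3.0e6f };
    float out[4];
    int i;
    _mm_storeu_ps(out, abs_ps(_mm_loadu_ps(in)));
    for (i = 0; i < 4; i++)
        printf("|%g| = %g (fabsf gives %g)\n", in[i], out[i], fabsf(in[i]));
    return 0;
}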
-+//***************Vector compare absolute less-than *********************** -+//************************************************************************* -+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); -+ a0 = _mm_cmplt_ps (a0, b0); -+ return64f(a0); -+} -+ -+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 -+{ -+ __m128i c7fffffff; -+ __m128 a0, b0; -+ c7fffffff = _mm_set1_epi32 (0x7fffffff); -+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); -+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); -+ a0 = _mm_cmplt_ps (a0, b0); -+ return (*(__m128i*)&a0); -+} -+ -+//*************************Vector test bits************************************ -+//***************************************************************************** -+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them -+with the corresponding element of a second vector. If the result is not zero, the -+corresponding element in the destination vector is set to all ones. Otherwise, it is set to -+all zeros. */ -+ -+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 -+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vtstq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 -+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vtstq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 -+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vtstq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 -+#define vtst_u8 vtst_s8 -+ -+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 -+#define vtst_u16 vtst_s16 -+ -+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 -+#define vtst_u32 vtst_s32 -+ -+ -+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 -+#define vtst_p8 vtst_u8 -+ -+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 -+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi8 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 -+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi16 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 -+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 -+{ -+ __m128i zero, one, res; -+ zero = _mm_setzero_si128 (); -+ one = _mm_cmpeq_epi8(zero,zero); 
//0xfff..ffff -+ res = _mm_and_si128 (a, b); -+ res = _mm_cmpeq_epi32 (res, zero); -+ return _mm_xor_si128(res, one); //invert result -+} -+ -+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 -+#define vtstq_u8 vtstq_s8 -+ -+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 -+#define vtstq_u16 vtstq_s16 -+ -+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 -+#define vtstq_u32 vtstq_s32 -+ -+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 -+#define vtstq_p8 vtstq_u8 -+ -+//****************** Absolute difference ******************** -+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** -+//************************************************************ -+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vabdq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vabdq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vabdq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(vabdq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(vabdq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ return64(vabdq_u32(_pM128i(a), _pM128i(b))); -+} -+ -+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = vabdq_f32(_pM128(a), _pM128(b)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi8 (a, b); -+ return _mm_abs_epi8 (res); -+} -+ -+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi16 (a,b); -+ return _mm_abs_epi16 (res); -+} -+ -+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 -+{ -+ __m128i res; -+ res = _mm_sub_epi32 (a,b); -+ return _mm_abs_epi32 (res); -+} -+ -+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u8(a,b); -+ difab = _mm_sub_epi8(a,b); -+ difba = _mm_sub_epi8 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t 
vabdq_u16(uint16x8_t a, uint16x8_t b) -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u16(a,b); -+ difab = _mm_sub_epi16(a,b); -+ difba = _mm_sub_epi16 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) -+{ -+ __m128i cmp, difab, difba; -+ cmp = vcgtq_u32(a,b); -+ difab = _mm_sub_epi32(a,b); -+ difba = _mm_sub_epi32 (b,a); -+ difab = _mm_and_si128(cmp, difab); -+ difba = _mm_andnot_si128(cmp, difba); -+ return _mm_or_si128(difab, difba); -+} -+ -+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 -+{ -+ __m128i c1; -+ __m128 res; -+ c1 = _mm_set1_epi32(0x7fffffff); -+ res = _mm_sub_ps (a, b); -+ return _mm_and_ps (res, *(__m128*)&c1); -+} -+ -+//************ Absolute difference - long ************************** -+//******************************************************************** -+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0 -+{ -+ __m128i a16, b16; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ return vabdq_s16(a16, b16); -+ -+} -+ -+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0 -+{ -+ __m128i a32, b32; -+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 -+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, -+ return vabdq_s32(a32, b32); -+} -+ -+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no optimal SIMD solution, serial looks faster -+ _NEON2SSE_ALIGN_16 int64_t res[2]; -+ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0]; -+ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0]; -+ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1]; -+ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) -+{ -+ __m128i res; -+ res = vsubl_u8(a,b); -+ return _mm_abs_epi16(res); -+} -+ -+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) -+{ -+ __m128i res; -+ res = vsubl_u16(a,b); -+ return _mm_abs_epi32(res); -+} -+ -+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0]; -+ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0]; -+ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1]; -+ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//**********Absolute difference and 
accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* -+//********************************************************************************************* -+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) -+{ -+ int8x8_t res64; -+ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); -+} -+ -+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) -+{ -+ int16x4_t res64; -+ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) -+{ -+ int32x2_t res64; -+ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 -+#define vaba_u8 vaba_s8 -+ -+ -+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0 -+#define vaba_u16 vaba_s16 -+ -+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) -+{ -+ uint32x2_t res64; -+ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); -+} -+ -+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 -+{ -+ int8x16_t sub; -+ sub = vabdq_s8(b, c); -+ return vaddq_s8( a, sub); -+} -+ -+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 -+{ -+ int16x8_t sub; -+ sub = vabdq_s16(b, c); -+ return vaddq_s16( a, sub); -+} -+ -+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 -+{ -+ int32x4_t sub; -+ sub = vabdq_s32(b, c); -+ return vaddq_s32( a, sub); -+} -+ -+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) -+{ -+ uint8x16_t sub; -+ sub = vabdq_u8(b, c); -+ return vaddq_u8( a, sub); -+} -+ -+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) -+{ -+ uint16x8_t sub; -+ sub = vabdq_u16(b, c); -+ return vaddq_u16( a, sub); -+} -+ -+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) -+{ -+ uint32x4_t sub; -+ sub = vabdq_u32(b, c); -+ return vaddq_u32( a, sub); -+} -+ -+//************** Absolute difference and accumulate - long ******************************** -+//************************************************************************************* -+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 -+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0 -+{ -+ __m128i b16, c16, res; -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, -+ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, -+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); -+ return _mm_add_epi16 (a, res); -+} -+ -+int32x4_t vabal_s16(int32x4_t 
a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 -+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0 -+{ -+ __m128i b32, c32, res; -+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 -+ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 -+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); -+ return _mm_add_epi32 (a, res); -+} -+ -+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res; -+ res = vabdl_s32(b,c); -+ return _mm_add_epi64(a, res); -+} -+ -+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 -+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ __m128i b16, c16, res; -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, -+ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, -+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); -+ return _mm_add_epi16 (a, res); -+} -+ -+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0 -+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) -+{ -+ __m128i b32, c32, res; -+ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 -+ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 -+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); -+ return _mm_add_epi32 (a, res); -+} -+ -+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ __m128i res; -+ res = vabdl_u32(b,c); -+ return _mm_add_epi64(a, res); -+} -+ -+//*********************************************************************************** -+//**************** Maximum and minimum operations ********************************** -+//*********************************************************************************** -+//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ******* -+//*********************************************************************************** -+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i res; -+ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial -+ return64(res); -+} -+ -+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; -+ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; -+ return res; -+} -+ -+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 -+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 -+ -+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 -+#define vmaxq_s16 _mm_max_epi16 -+ -+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 -+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 -+ -+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 -+#define vmaxq_u8 _mm_max_epu8 -+ -+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 -+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 -+ -+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 -+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 -+ -+ -+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 -+#define vmaxq_f32 _mm_max_ps -+ -+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** -+//*********************************************************************************************************** -+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits -+ return64(res); -+} -+ -+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b) -+{ -+ uint16x4_t res64; -+ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); -+} -+ -+ -+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b) -+{ -+ uint32x2_t res64; -+ __m128i res; -+ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial -+ return64(res); -+} -+ -+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; -+ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; -+ return res; -+} -+ -+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 -+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1 -+ -+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 -+#define vminq_s16 _mm_min_epi16 -+ -+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 -+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1 -+ -+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 -+#define vminq_u8 _mm_min_epu8 -+ -+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 -+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1 -+ -+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 -+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1 -+ -+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 -+#define vminq_f32 _mm_min_ps -+ -+//************* Pairwise addition operations. 
************************************** -+//************************************************************************************ -+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector -+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit and then pack -+ int8x8_t res64; -+ __m128i a16, b16, res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 -+ res = _mm_hadd_epi16 (a16, b16); -+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits -+ return64(res); -+} -+ -+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ __m128i hadd128; -+ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); -+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(hadd128); -+} -+ -+ -+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ __m128i hadd128; -+ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); -+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(hadd128); -+} -+ -+ -+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0 -+{ -+ // no 8 bit hadd in IA32, need to go to 16 bit and then pack -+ uint8x8_t res64; -+// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work -+ __m128i mask8, a16, b16, res; -+ mask8 = _mm_set1_epi16(0xff); -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 -+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 -+ res = _mm_hadd_epi16 (a16, b16); -+ res = _mm_and_si128(res, mask8); //to avoid saturation -+ res = _mm_packus_epi16 (res,res); //use low 64 bits -+ return64(res); -+} -+ -+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0 -+{ -+ // solution may be not optimal, serial execution may be faster -+ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed -+ uint16x4_t res64; -+ __m128i c32767, cfffe, as, bs, res; -+ c32767 = _mm_set1_epi16 (32767); -+ cfffe = _mm_set1_epi16 (0xfffe); -+ as = _mm_sub_epi16 (_pM128i(a), c32767); -+ bs = _mm_sub_epi16 (_pM128i(b), c32767); -+ res = _mm_hadd_epi16 (as, bs); -+ res = _mm_add_epi16 (res, cfffe); -+ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(res); -+} -+ -+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 -+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster -+{ -+ //hadd doesn't work for unsigned values -+ uint32x2_t res64; -+ __m128i ab, ab_sh, res; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 -+ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0 -+ res = _mm_add_epi32(ab, ab_sh); -+ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ return64(res); -+} -+ -+float32x2_t vpadd_f32(float32x2_t a, float32x2_t 
b); // VPADD.F32 d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b) -+{ -+ __m128 hadd128; -+ __m64_128 res64; -+ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b)); -+ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits -+ _M64f(res64, hadd128); -+ return res64; -+} -+ -+ -+//************************** Long pairwise add ********************************** -+//********************************************************************************* -+//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, -+// and places the final results in the destination vector. -+ -+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 -+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit anyway -+ __m128i a16; -+ int16x4_t res64; -+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 -+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits -+ return64(a16); -+} -+ -+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 -+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0 -+{ -+ // solution may be not optimal, serial execution may be faster -+ int32x2_t res64; -+ __m128i r32_1; -+ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); -+ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits -+ return64(r32_1); -+} -+ -+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1]; -+ return res; -+} -+ -+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0 -+{ -+ // no 8 bit hadd in IA32, need to go to 16 bit -+// no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work -+ uint16x4_t res64; -+ __m128i a16; -+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits -+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits -+ return64(a16); -+} -+ -+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than a SIMD one -+ uint32x2_t res; -+ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1]; -+ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3]; -+ return res; -+} -+ -+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1]; -+ return res; -+} -+ -+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 -+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i r16_1, r16_2; -+ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 -+ //swap hi and low part of r to process the remaining data -+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r16_2 = _MM_CVTEPI8_EPI16 (r16_2); -+ return _mm_hadd_epi16 (r16_1, r16_2); -+} -+ -+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 -+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i 
r32_1, r32_2; -+ r32_1 = _MM_CVTEPI16_EPI32(a); -+ //swap hi and low part of r to process the remaining data -+ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r32_2 = _MM_CVTEPI16_EPI32 (r32_2); -+ return _mm_hadd_epi32 (r32_1, r32_2); -+} -+ -+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 -+{ -+ _NEON2SSE_ALIGN_16 int32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 int64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; -+ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 -+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 -+{ -+ //no 8 bit hadd in IA32, need to go to 16 bit -+ __m128i r16_1, r16_2; -+ r16_1 = _MM_CVTEPU8_EPI16(a); -+ //swap hi and low part of r to process the remaining data -+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ r16_2 = _MM_CVTEPU8_EPI16 (r16_2); -+ return _mm_hadd_epi16 (r16_1, r16_2); -+} -+ -+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than a SIMD one -+ _NEON2SSE_ALIGN_16 uint16_t atmp[8]; -+ _NEON2SSE_ALIGN_16 uint32_t res[4]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; -+ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; -+ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; -+ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; -+ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//************************ Long pairwise add and accumulate ************************** -+//**************************************************************************************** -+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, -+// and accumulates the values of the results into the elements of the destination (wide) vector -+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 -+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) -+{ -+ int16x4_t res64; -+ return64(vpadalq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 -+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) -+{ -+ int32x2_t res64; -+ return64(vpadalq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 -+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0]; -+ return res; -+} -+ -+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) -+{ -+ uint16x4_t res64; -+ return64(vpadalq_u8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 
d0,d0 -+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) -+{ -+ uint32x2_t res64; -+ return64(vpadalq_u16(_pM128i(a), _pM128i(b))); -+} -+ -+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 -+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0]; -+ return res; -+} -+ -+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 -+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 -+{ -+ int16x8_t pad; -+ pad = vpaddlq_s8(b); -+ return _mm_add_epi16 (a, pad); -+} -+ -+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 -+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 -+{ -+ int32x4_t pad; -+ pad = vpaddlq_s16(b); -+ return _mm_add_epi32(a, pad); -+} -+ -+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 -+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) -+{ -+ int64x2_t pad; -+ pad = vpaddlq_s32(b); -+ return _mm_add_epi64 (a, pad); -+} -+ -+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 -+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 -+{ -+ uint16x8_t pad; -+ pad = vpaddlq_u8(b); -+ return _mm_add_epi16 (a, pad); -+} -+ -+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x4_t pad; -+ pad = vpaddlq_u16(b); -+ return _mm_add_epi32(a, pad); -+} //no optimal SIMD solution, serial is faster -+ -+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no optimal SIMD solution, serial is faster -+ uint64x2_t pad; -+ pad = vpaddlq_u32(b); -+ return _mm_add_epi64(a, pad); -+} //no optimal SIMD solution, serial is faster -+ -+//********** Folding maximum ************************************* -+//******************************************************************* -+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors, -+//and copies the larger of each pair into the corresponding element in the destination -+// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison -+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0 -+{ -+ int8x8_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1 -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(max); //we need 64 bits only -+} -+ -+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ int16x4_t res64; -+ __m128i ab, ab1, max; -+ 
_NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ max = _mm_max_epi16 (ab, ab1); -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(max); -+} -+ -+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ int32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} -+ -+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0 -+{ -+ uint8x8_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ max = _mm_max_epu8 (ab, ab1); // SSE4.1 -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(max); -+} -+ -+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ uint16x4_t res64; -+ __m128i ab, ab1, max; -+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ max = _MM_MAX_EPU16 (ab, ab1); -+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(max); -+} -+ -+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ uint32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} //serial solution looks faster than a SIMD one -+ -+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; -+ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; -+ return res; -+} -+ -+// ***************** Folding minimum **************************** -+// ************************************************************** -+//vpmin -> takes minimum of adjacent pairs -+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0 -+{ -+ int8x8_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding -+ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1 -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(min); -+} -+ -+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ int16x4_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask -+ min = _mm_min_epi16 (ab, ab1); -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(min); -+} -+ -+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ int32x2_t res; -+ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; -+ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; -+ return res; -+} -+ -+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 -+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0 -+{ -+ uint8x8_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; -+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding -+ min = _mm_min_epu8 (ab, ab1); // SSE4.1 -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data -+ return64(min); -+} -+ -+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0 -+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0 -+{ -+ //solution may be not optimal compared with the serial one -+ uint16x4_t res64; -+ __m128i ab, ab1, min; -+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number -+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab -+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask -+ min = _MM_MIN_EPU16 (ab, ab1); -+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask -+ return64(min); -+} -+ -+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ uint32x2_t res; -+ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; -+ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; -+ return res; -+} -+ -+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution looks faster than SIMD one -+ float32x2_t res; -+ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; -+ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0]; -+ return res; -+} -+ -+//*************************************************************** -+//*********** Reciprocal/Sqrt ************************************ -+//*************************************************************** -+//****************** Reciprocal estimate ******************************* -+//the ARM NEON and x86 SIMD results may be slightly different -+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 -+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = _mm_rcp_ps(_pM128(a)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! 
No reciprocal for ints in IA32 available -+ uint32x2_t res; -+ float resf, r; -+ int i, q, s; -+ for (i =0; i<2; i++){ -+ if((a.m64_u32[i] & 0x80000000) == 0) { -+ res.m64_u32[i] = 0xffffffff; -+ }else{ -+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); -+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ -+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res.m64_u32[i] = r * (uint32_t)(1 << 31); -+ } -+ } -+ return res; -+} -+ -+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 -+#define vrecpeq_f32 _mm_rcp_ps -+ -+ -+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4]; -+ _NEON2SSE_ALIGN_16 uint32_t res[4]; -+ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000}; -+ float resf, r; -+ int i, q, s; -+ __m128i res128, mask, zero; -+ _mm_store_si128((__m128i*)atmp, a); -+ zero = _mm_setzero_si128(); -+ for (i =0; i<4; i++){ -+ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31)) -+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ -+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); -+ } -+ res128 = _mm_load_si128((__m128i*)res); -+ mask = _mm_and_si128(a, *(__m128i*)c80000000); -+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff -+ return _mm_or_si128(res128, mask); -+} -+ -+//**********Reciprocal square root estimate **************** -+//********************************************************** -+//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster -+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers -+////the ARM NEON and x86 SIMD results may be slightly different -+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 -+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = _mm_rsqrt_ps(_pM128(a)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double -+ uint32x2_t res; -+ __m128 tmp; -+ float r, resf, coeff; -+ int i,q0, q1, s;; -+ for (i =0; i<2; i++){ -+ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff -+ res.m64_u32[i] = 0xffffffff; -+ }else{ -+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); -+ coeff = (resf < 0.5)? 
512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ -+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ -+ r = ((float)q0 + 0.5) / coeff; -+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ -+ _mm_store_ss(&r, tmp); -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res.m64_u32[i] = r * (((uint32_t)1) << 31); -+ } -+ } -+ return res; -+} -+ -+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 -+#define vrsqrteq_f32 _mm_rsqrt_ps -+ -+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //Input is fixed point number!!! -+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double -+ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4]; -+ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)}; -+ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000}; -+ __m128 tmp; -+ __m128i res128, mask, zero; -+ float r, resf, coeff; -+ int i,q0, q1, s; -+ _mm_store_si128((__m128i*)atmp, a); -+ zero = _mm_setzero_si128(); -+ for (i =0; i<4; i++){ -+ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31))); -+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ -+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ -+ r = ((float)q0 + 0.5) / coeff; -+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ -+ _mm_store_ss(&r, tmp); -+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ -+ r = (float)s / 256.0; -+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); -+ } -+ res128 = _mm_load_si128((__m128i*)res); -+ mask = _mm_and_si128(a, *(__m128i*)c_c0000000); -+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff -+ return _mm_or_si128(res128, mask); -+} -+//************ Reciprocal estimate/step and 1/sqrt estimate/step *************************** -+//****************************************************************************************** -+//******VRECPS (Vector Reciprocal Step) *************************************************** -+//multiplies the elements of one vector by the corresponding elements of another vector, -+//subtracts each of the results from 2, and places the final results into the elements of the destination vector. -+ -+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 -+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x4_t res; -+ __m64_128 res64; -+ res = vrecpsq_f32(_pM128(a), _pM128(b)); -+ _M64f(res64, res); -+ return res64; -+} -+ -+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 -+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0 -+{ -+ __m128 f2, mul; -+ f2 = _mm_set1_ps(2.); -+ mul = _mm_mul_ps(a,b); -+ return _mm_sub_ps(f2,mul); -+} -+ -+//*****************VRSQRTS (Vector Reciprocal Square Root Step) ***************************** -+//multiplies the elements of one vector by the corresponding elements of another vector, -+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector. 
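The comment above describes the VRSQRTS step (3 - a*b)/2, which is exactly the correction factor of one Newton-Raphson iteration for 1/sqrt(x): given an estimate y, the refined value is y*(3 - x*y*y)/2. A self-contained sketch of that refinement on top of _mm_rsqrt_ps (illustrative only, not part of the patch; rsqrt_refined_ps is a made-up name):

/* Illustrative sketch (not part of the patch): one Newton-Raphson refinement
 * of _mm_rsqrt_ps using the (3 - a*b)/2 step that vrsqrtsq_f32 implements
 * just below. */
#include <xmmintrin.h>
#include <math.h>
#include <stdio.h>

static __m128 rsqrt_refined_ps(__m128 x)
{
    __m128 y = _mm_rsqrt_ps(x);                    /* ~12-bit estimate      */
    __m128 xyy = _mm_mul_ps(x, _mm_mul_ps(y, y));  /* x * y * y             */
    __m128 step = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(3.0f), xyy),
                             _mm_set1_ps(0.5f));   /* (3 - x*y*y) / 2       */
    return _mm_mul_ps(y, step);                    /* refined 1/sqrt(x)     */
}

int main(void)
{
    float in[4] = { 0.25f, 2.0f, 9.0f, 1.0e6f };
    float out[4];
    int i;
    _mm_storeu_ps(out, rsqrt_refined_ps(_mm_loadu_ps(in)));
    for (i = 0; i < 4; i++)
        printf("rsqrt(%g) ~ %.7g (exact %.7g)\n", in[i], out[i], 1.0 / sqrt(in[i]));
    return 0;
}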
-+
-+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
-+ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
-+ return res;
-+}
-+
-+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
-+{
-+ __m128 f3, f05, mul;
-+ f3 = _mm_set1_ps(3.);
-+ f05 = _mm_set1_ps(0.5);
-+ mul = _mm_mul_ps(a,b);
-+ f3 = _mm_sub_ps(f3,mul);
-+ return _mm_mul_ps (f3, f05);
-+}
-+//********************************************************************************************
-+//***************************** Shifts by signed variable ***********************************
-+//********************************************************************************************
-+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
-+//********************************************************************************************
-+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
-+//helper macro. It matches ARM implementation for big shifts
-+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
-+ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
-+ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? 
a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \ -+ return res; -+ -+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(8, i, 8) -+} -+ -+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(16, i, 4) -+} -+ -+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(32, i, 2) -+} -+ -+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(64, i, 1) -+} -+ -+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(8, u, 8) -+} -+ -+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(16, u, 4) -+} -+ -+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT_64(32, u, 2) -+} -+ -+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers -+{ -+ SERIAL_SHIFT_64(64, u, 1) -+} -+ -+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int8_t, int8_t, 16, 16) -+} -+ -+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int16_t, int16_t, 8, 8) -+} -+ -+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int32_t, int32_t, 4, 4) -+} -+ -+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(int64_t, int64_t, 2, 2) -+} -+ -+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint8_t, int8_t, 16, 16) -+} -+ -+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint16_t, int16_t, 8, 8) -+} -+ -+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t 
b); // VSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint32_t, int32_t, 4, 4) -+} -+ -+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SHIFT(uint64_t, int64_t, 2, 2) -+} -+ -+ -+//*********** Vector saturating shift left: (negative values shift right) ********************** -+//******************************************************************************************** -+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution -+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ -+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (atmp[i] ==0) res[i] = 0; \ -+ else{ \ -+ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \ -+ else{ \ -+ if (btmp[i]>lanesize_1) { \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ -+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ else res[i] = atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ -+ TYPE lanesize = (sizeof(TYPE) << 3); \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (atmp[i] ==0) res[i] = 0; \ -+ else{ \ -+ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \ -+ else{ \ -+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ -+ else{ \ -+ limit = (TYPE) 1 << (lanesize - btmp[i]); \ -+ res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \ -+ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \ -+ for (i = 0; i<LEN; i++) { \ -+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \ -+ else{ \ -+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize_1) { \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ -+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ -+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ -+ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \ -+ else{ \ -+ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ -+ else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ -+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? 
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \ -+ return res; -+ -+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8) -+} -+ -+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4) -+} -+ -+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2) -+} -+ -+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1) -+} -+ -+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8) -+} -+ -+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4) -+} -+ -+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2) -+} -+ -+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1) -+} -+ -+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) -+} -+ -+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) -+} -+ -+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) -+} -+ -+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) -+} -+ -+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) -+} -+ -+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8) -+} -+ -+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4) -+} -+ -+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2) -+} -+ -+ -+//******** Vector rounding shift left: (negative values shift right) ********** -+//**************************************************************************** -+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution -+//rounding makes sense for right shifts only. -+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i<LEN; i++) { \ -+ if(btmp[i] >= 0) { \ -+ if(btmp[i] >= lanesize) res[i] = 0; \ -+ else res[i] = (atmp[i] << btmp[i]); \ -+ }else{ \ -+ res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \ -+ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \ -+ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \ -+ return _mm_load_si128((__m128i*)res); -+ -+ -+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \ -+ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \ -+ for (i = 0; i<LEN; i++) { \ -+ if(b.m64_i ## TYPE[i] >= 0) { \ -+ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \ -+ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \ -+ }else{ \ -+ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \ -+ (b.m64_i ## TYPE[i] == -lanesize) ? 
(a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \ -+ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \ -+ return res; -+ -+ -+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(8,i,8) -+} -+ -+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(16,i,4) -+} -+ -+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(32,i,2) -+} -+ -+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(64,i,1) -+} -+ -+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(8,u,8) -+} -+ -+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(16,u,4) -+} -+ -+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(32,u,2) -+} -+ -+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT_64(64,u,1) -+} -+ -+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) -+} -+ -+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) -+} -+ -+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) -+} -+ -+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) -+} -+ -+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 
16, 16) -+} -+ -+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8) -+} -+ -+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4) -+} -+ -+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2) -+} -+ -+ -+//********** Vector saturating rounding shift left: (negative values shift right) **************** -+//************************************************************************************************* -+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution -+//Saturation happens for left shifts only while rounding makes sense for right shifts only. -+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ -+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (atmp[i] ==0) res[i] = 0; \ -+ else{ \ -+ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ -+ else{ \ -+ if (btmp[i]>lanesize_1) { \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ -+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ -+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ -+ else res[i] = atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ -+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ -+ int lanesize = (sizeof(TYPE) << 3); \ -+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (atmp[i] ==0) res[i] = 0; \ -+ else{ \ -+ if(btmp[i] <0) res[i] = (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ -+ else{ \ -+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ -+ else{ \ -+ limit = (TYPE) 1 << (lanesize - btmp[i]); \ -+ res[i] = ( atmp[i] >= limit) ? 
res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ -+ return _mm_load_si128((__m128i*)res); -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \ -+ __m64_128 res; int ## TYPE ## _t limit; int i; \ -+ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \ -+ for (i = 0; i<LEN; i++) { \ -+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \ -+ else{ \ -+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize_1) { \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ -+ }else{ \ -+ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ -+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ -+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ -+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \ -+ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ -+ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \ -+ for (i = 0; i<LEN; i++) { \ -+ if (a.m64_u ## TYPE[i] ==0) res.m64_u ## TYPE[i] = 0; \ -+ else{ \ -+ if(b.m64_i ## TYPE[i] <0) res.m64_u ## TYPE[i] = (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ -+ else{ \ -+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ -+ else{ \ -+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ -+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ -+ return res; -+ -+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8) -+} -+ -+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4) -+} -+ -+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2) -+} -+ -+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1) -+} -+ -+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8) -+} -+ -+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4) -+} -+ -+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ 
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2) -+} -+ -+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1) -+} -+ -+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) -+} -+ -+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) -+} -+ -+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) -+} -+ -+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) -+} -+ -+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) -+} -+ -+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) -+} -+ -+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) -+} -+ -+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) -+} -+ -+// ********************************************************************************* -+// ***************************** Shifts by a constant ***************************** -+// ********************************************************************************* -+//**************** Vector shift right by constant************************************* -+//************************************************************************************ -+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_srai_epi16 (r, b); //SSE2 -+ r = _mm_packs_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b) -+{ -+ int16x4_t 
res64; -+ return64(_mm_srai_epi16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(_mm_srai_epi32(_pM128i(a), b)); -+} -+ -+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //no arithmetic shift for 64bit values, serial solution used -+ int64x1_t res; -+ if(b>=64) res.m64_i64[0] = 0; -+ else res.m64_i64[0] = (*(int64_t*)&a) >> b; -+ return res; -+} -+ -+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ uint8x8_t res64; -+ __m128i r; -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one -+ r = _mm_packus_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b) -+{ -+ uint16x4_t res64; -+ return64(_mm_srli_epi16(_pM128i(a), b)); -+} -+ -+ -+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ return64(_mm_srli_epi32(_pM128i(a), b)); -+} -+ -+ -+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b) -+{ -+ uint64x1_t res64; -+ return64(_mm_srli_epi64(_pM128i(a), b)); -+} -+ -+ -+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i zero, mask0, a_sign, r, a_sign_mask; -+ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; -+ zero = _mm_setzero_si128(); -+ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 -+ r = _mm_srai_epi16 (a, b); -+ a_sign_mask = _mm_and_si128 (mask0, a_sign); -+ r = _mm_andnot_si128 (mask0, r); -+ return _mm_or_si128 (r, a_sign_mask); -+} -+ -+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 -+#define vshrq_n_s16 _mm_srai_epi16 -+ -+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 -+#define vshrq_n_s32 _mm_srai_epi32 -+ -+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) -+{ -+ //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD -+ __m128i c1, signmask,a0, res64; -+ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; -+ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff -+ signmask = _mm_slli_epi64 (c1, (64 - b)); -+ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit -+ a0 = _MM_CMPEQ_EPI64 (a, a0); -+ signmask = _mm_and_si128(a0, 
signmask); -+ res64 = _mm_srli_epi64 (a, b); -+ return _mm_or_si128(res64, signmask); -+} -+ -+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 -+{ -+ //no 8 bit shift available, need the special trick -+ __m128i mask0, r; -+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; -+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ r = _mm_srli_epi16 ( a, b); -+ return _mm_and_si128 (r, mask0); -+} -+ -+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 -+#define vshrq_n_u16 _mm_srli_epi16 -+ -+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 -+#define vshrq_n_u32 _mm_srli_epi32 -+ -+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 -+#define vshrq_n_u64 _mm_srli_epi64 -+ -+//*************************** Vector shift left by constant ************************* -+//********************************************************************************* -+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_slli_epi16 (r, b); //SSE2 -+ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b) -+{ -+ int16x4_t res64; -+ return64(_mm_slli_epi16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ int32x2_t res64; -+ return64(_mm_slli_epi32(_pM128i(a), b)); -+} -+ -+ -+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b) -+{ -+ int64x1_t res64; -+ return64(_mm_slli_epi64(_pM128i(a), b)); -+} -+ -+ -+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b) -+{ -+ //no 8 bit shift available, go to 16 bit -+ uint8x8_t res64; -+ __m128i mask8; -+ __m128i r; -+ mask8 = _mm_set1_epi16(0xff); -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r = _mm_slli_epi16 (r, b); //SSE2 -+ r = _mm_and_si128(r, mask8); //to avoid saturation -+ r = _mm_packus_epi16 (r,r); //we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 -+#define vshl_n_u16 vshl_n_s16 -+ -+ -+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 -+#define vshl_n_u32 vshl_n_s32 -+ -+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 -+#define vshl_n_u64 vshl_n_s64 -+ -+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+#define vshlq_n_s8 vshlq_n_u8 -+ -+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); 
// VSHL.I16 q0,q0,#0 -+#define vshlq_n_s16 _mm_slli_epi16 -+ -+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+#define vshlq_n_s32 _mm_slli_epi32 -+ -+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+#define vshlq_n_s64 _mm_slli_epi64 -+ -+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) -+{ -+ //no 8 bit shift available, need the special trick -+ __m128i mask0, r; -+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; -+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift -+ r = _mm_slli_epi16 ( a, b); -+ return _mm_and_si128 (r, mask0); -+} -+ -+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 -+#define vshlq_n_u16 vshlq_n_s16 -+ -+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 -+#define vshlq_n_u32 vshlq_n_s32 -+ -+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 -+#define vshlq_n_u64 vshlq_n_s64 -+ -+//************* Vector rounding shift right by constant ****************** -+//************************************************************************* -+//No corresponding x86 intrinsics exist, need to do some tricks -+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit -+ int8x8_t res64; -+ __m128i r, maskb; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 -+ r = _mm_srai_epi16 (r, b); -+ r = _mm_add_epi16 (r, maskb); //actual rounding -+ r = _mm_packs_epi16 (r,r); ////we need 64 bits only -+ return64(r); -+} -+ -+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b) -+{ -+ int16x4_t res64; -+ return64(vrshrq_n_s16(_pM128i(a), b)); -+} -+ -+ -+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(vrshrq_n_s32(_pM128i(a), b)); -+} -+ -+ -+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ //serial solution is faster -+ int64x1_t res; -+ int64_t a_i64 = *( int64_t*)&a; -+ if(b==64) { -+ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63; -+ } else { -+ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1)); -+ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1)); -+ } -+ return res; -+} -+ -+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one -+ uint8x8_t res64; -+ __m128i r, maskb; -+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit -+ maskb = 
_mm_srli_epi16 (maskb, 15); //1 or 0 -+ r = _mm_srli_epi16 (r, b); -+ r = _mm_add_epi16 (r, maskb); //actual rounding -+ r = _mm_packus_epi16 (r,r); ////we need 64 bits only -+ return64(r); -+} -+ -+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b) -+{ -+ uint16x4_t res64; -+ return64(vrshrq_n_u16(_pM128i(a), b)); -+} -+ -+ -+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ return64(vrshrq_n_u32(_pM128i(a), b)); -+} -+ -+ -+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b) -+{ -+ uint64x1_t res64; -+ return64(vrshrq_n_u64(_pM128i(a), b)); -+} -+ -+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i r, mask1, maskb; -+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 -+ r = vshrq_n_s8 (a, b); -+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding -+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding -+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 -+ return _mm_add_epi8(r, maskb); //actual rounding -+} -+ -+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 -+ r = _mm_srai_epi16 (a, b); -+ return _mm_add_epi16 (r, maskb); //actual rounding -+} -+ -+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 -+ r = _mm_srai_epi32(a, b); -+ return _mm_add_epi32 (r, maskb); //actual rounding -+} -+ -+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) -+{ -+ //solution may be not optimal compared with a serial one -+ __m128i maskb; -+ int64x2_t r; -+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 -+ r = vshrq_n_s64(a, b); -+ return _mm_add_epi64 (r, maskb); //actual rounding -+} -+ -+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 -+{ -+ //no 8 bit shift available, go to 16 bit trick -+ __m128i r, mask1, maskb; -+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 -+ r = vshrq_n_u8 (a, b); -+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding -+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding -+ 
maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 -+ return _mm_add_epi8(r, maskb); //actual rounding -+} -+ -+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0 -+ r = _mm_srli_epi16 (a, b); -+ return _mm_add_epi16 (r, maskb); //actual rounding -+} -+ -+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 -+{ -+ __m128i maskb, r; -+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0 -+ r = _mm_srli_epi32(a, b); -+ return _mm_add_epi32 (r, maskb); //actual rounding -+} -+ -+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) -+{ -+ //solution may be not optimal compared with a serial one -+ __m128i maskb, r; -+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit -+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0 -+ r = _mm_srli_epi64(a, b); -+ return _mm_add_epi64 (r, maskb); //actual rounding -+} -+ -+//************* Vector shift right by constant and accumulate ********* -+//********************************************************************* -+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8 -+{ -+ int8x8_t shift; -+ shift = vshr_n_s8(b, c); -+ return vadd_s8( a, shift); -+} -+ -+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16 -+{ -+ int16x4_t shift; -+ shift = vshr_n_s16( b, c); -+ return vadd_s16(a, shift); -+} -+ -+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ int32x2_t shift; -+ shift = vshr_n_s32(b, c); -+ return vadd_s32( a, shift); -+} -+ -+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 -+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) -+{ -+ //may be not optimal compared with a serial solution -+ int64x1_t shift; -+ shift = vshr_n_s64(b, c); -+ return vadd_s64( a, shift); -+} -+ -+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8 -+{ -+ uint8x8_t shift; -+ shift = vshr_n_u8(b, c); -+ return vadd_u8(a, shift); -+} -+ -+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16 -+{ -+ uint16x4_t shift; -+ shift = vshr_n_u16(b, c); -+ return vadd_u16(a,shift); -+} -+ -+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // 
VSRA.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ uint32x2_t shift; -+ shift = vshr_n_u32(b, c); -+ return vadd_u32( a, shift); -+} -+ -+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 -+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64 -+{ -+ //may be not optimal compared with the serial execution -+ uint64x1_t shift; -+ shift = vshr_n_u64(b, c); -+ return vadd_u64(a, shift); -+} -+ -+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 -+{ -+ int8x16_t shift; -+ shift = vshrq_n_s8(b, c); -+ return vaddq_s8(a, shift); -+} -+ -+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 -+{ -+ int16x8_t shift; -+ shift = vshrq_n_s16(b, c); -+ return vaddq_s16(a, shift); -+} -+ -+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 -+{ -+ int32x4_t shift; -+ shift = vshrq_n_s32(b, c); -+ return vaddq_s32(a, shift); -+} -+ -+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 -+{ -+ int64x2_t shift; -+ shift = vshrq_n_s64(b, c); -+ return vaddq_s64( a, shift); -+} -+ -+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 -+{ -+ uint8x16_t shift; -+ shift = vshrq_n_u8(b, c); -+ return vaddq_u8(a, shift); -+} -+ -+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16 -+{ -+ uint16x8_t shift; -+ shift = vshrq_n_u16(b, c); -+ return vaddq_u16(a, shift); -+} -+ -+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 -+{ -+ uint32x4_t shift; -+ shift = vshrq_n_u32(b, c); -+ return vaddq_u32(a, shift); -+} -+ -+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 -+{ -+ uint64x2_t shift; -+ shift = vshrq_n_u64(b, c); -+ return vaddq_u64(a, shift); -+} -+ -+//************* Vector rounding shift right by constant and accumulate **************************** -+//************************************************************************************************ -+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8 -+{ -+ int8x8_t shift; -+ shift = 
vrshr_n_s8(b, c); -+ return vadd_s8( a, shift); -+} -+ -+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16 -+{ -+ int16x4_t shift; -+ shift = vrshr_n_s16( b, c); -+ return vadd_s16(a, shift); -+} -+ -+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ int32x2_t shift; -+ shift = vrshr_n_s32(b, c); -+ return vadd_s32( a, shift); -+} -+ -+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution -+{ -+ int64x1_t shift; -+ shift = vrshr_n_s64(b, c); -+ return vadd_s64( a, shift); -+} -+ -+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 -+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8 -+{ -+ uint8x8_t shift; -+ shift = vrshr_n_u8(b, c); -+ return vadd_u8(a, shift); -+} -+ -+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16 -+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16 -+{ -+ uint16x4_t shift; -+ shift = vrshr_n_u16(b, c); -+ return vadd_u16(a,shift); -+} -+ -+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 -+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32 -+{ -+ //may be not optimal compared with the serial execution -+ uint32x2_t shift; -+ shift = vrshr_n_u32(b, c); -+ return vadd_u32( a, shift); -+} -+ -+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution -+{ -+ //may be not optimal compared with the serial execution -+ uint64x1_t shift; -+ shift = vrshr_n_u64(b, c); -+ return vadd_u64( a, shift); -+} -+ -+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 -+{ -+ int8x16_t shift; -+ shift = vrshrq_n_s8(b, c); -+ return vaddq_s8(a, shift); -+} -+ -+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 -+{ -+ int16x8_t shift; -+ shift = vrshrq_n_s16(b, c); -+ return vaddq_s16(a, shift); -+} -+ -+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 -+{ -+ int32x4_t shift; -+ shift = vrshrq_n_s32(b, c); -+ return vaddq_s32(a, shift); -+} -+ -+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 -+_NEON2SSE_INLINE 
int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) -+{ -+ int64x2_t shift; -+ shift = vrshrq_n_s64(b, c); -+ return vaddq_s64(a, shift); -+} -+ -+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 -+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 -+{ -+ uint8x16_t shift; -+ shift = vrshrq_n_u8(b, c); -+ return vaddq_u8(a, shift); -+} -+ -+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 -+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 -+{ -+ uint16x8_t shift; -+ shift = vrshrq_n_u16(b, c); -+ return vaddq_u16(a, shift); -+} -+ -+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 -+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 -+{ -+ uint32x4_t shift; -+ shift = vrshrq_n_u32(b, c); -+ return vaddq_u32(a, shift); -+} -+ -+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 -+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) -+{ -+ uint64x2_t shift; -+ shift = vrshrq_n_u64(b, c); -+ return vaddq_u64(a, shift); -+} -+ -+//**********************Vector saturating shift left by constant ***************************** -+//******************************************************************************************** -+//we don't check const ranges assuming they are met -+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) -+ int8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); -+ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only -+ return64(r128); -+} -+ -+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0 -+{ -+ // go to 32 bit to get the auto saturation (in packs function) -+ int16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only -+ return64(r128); -+} -+ -+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ //serial execution may be faster -+ int32x2_t res64; -+ return64(vqshlq_n_s32 (_pM128i(a), b)); -+} -+ -+ -+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ int64x1_t res; -+ int64_t bmask; -+ int64_t a_i64 = *( int64_t*)&a; -+ bmask = ( int64_t)1 << (63 - b); //positive -+ if (a_i64 >= bmask) { -+ res.m64_i64[0] = ~(_SIGNBIT64); -+ } else { -+ res.m64_i64[0] = (a_i64 <= -bmask) ? 
_SIGNBIT64 : a_i64 << b; -+ } -+ return res; -+} -+ -+ -+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit -+ uint8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); //shift_res -+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only -+ return64(r128); -+} -+ -+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0 -+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0 -+{ -+ // go to 32 bit to get the auto saturation (in packus function) -+ uint16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16 -+ return64(r128); -+} -+ -+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 -+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b) -+{ -+ uint32x2_t res64; -+ return64(vqshlq_n_u32(_pM128i(a), b)); -+} -+ -+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ uint64x1_t res; -+ uint64_t bmask; -+ uint64_t a_i64 = *(uint64_t*)&a; -+ bmask = ( uint64_t)1 << (64 - b); -+ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a -+ return res; -+} -+ -+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 -+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 -+{ -+ // go to 16 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 -+} -+ -+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 -+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 -+{ -+ // manual saturation solution looks LESS optimal than 32 bits conversion one -+ // go to 32 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI16_EPI32 (a128); -+ r128_2 = _mm_slli_epi32 (a128, b); -+ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 -+} -+ -+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 -+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 -+{ -+ // no 64 bit saturation option available, special tricks necessary -+ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; -+ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff -+ maskA = _mm_srli_epi32(c1, b + 
1); //mask for positive numbers (32-b+1) zeros and b-1 ones -+ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise -+ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not -+ shift_res = _mm_slli_epi32 (a, b); -+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); -+ //result with positive numbers saturated -+ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); -+ //treat negative numbers -+ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros -+ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise -+ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not -+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); -+ return _mm_or_si128 (c7ffffff_mask, shift_res_mask); -+} -+ -+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; -+ int64_t bmask; -+ int i; -+ bmask = ( int64_t)1 << (63 - b); //positive -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ if (atmp[i] >= bmask) { -+ res[i] = ~(_SIGNBIT64); -+ } else { -+ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b; -+ } -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 -+{ -+ // go to 16 bit to get the auto saturation (in packs function) -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPU8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 -+} -+ -+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 -+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 -+{ -+ // manual saturation solution looks more optimal than 32 bits conversion one -+ __m128i cb, c8000, a_signed, saturation_mask, shift_res; -+ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); -+ c8000 = _mm_set1_epi16 (0x8000); -+//no unsigned shorts comparison in SSE, only signed available, so need the trick -+ a_signed = _mm_sub_epi16(a, c8000); //go to signed -+ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); -+ shift_res = _mm_slli_epi16 (a, b); -+ return _mm_or_si128 (shift_res, saturation_mask); -+} -+ -+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 -+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 -+{ -+ // manual saturation solution, no 64 bit saturation option, the serial version may be faster -+ __m128i cb, c80000000, a_signed, saturation_mask, shift_res; -+ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); -+ c80000000 = _mm_set1_epi32 (0x80000000); -+//no unsigned ints comparison in SSE, only signed available, so need the trick -+ a_signed = _mm_sub_epi32(a, c80000000); //go to signed -+ saturation_mask = _mm_cmpgt_epi32 
(a_signed, cb); -+ shift_res = _mm_slli_epi32 (a, b); -+ return _mm_or_si128 (shift_res, saturation_mask); -+} -+ -+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here -+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; -+ uint64_t bmask; -+ int i; -+ bmask = ( uint64_t)1 << (64 - b); -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//**************Vector signed->unsigned saturating shift left by constant ************* -+//************************************************************************************* -+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 -+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0 -+{ -+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) -+ uint8x8_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi16 (a128, b); -+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only -+ return64(r128); -+} -+ -+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 -+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0 -+{ -+ uint16x4_t res64; -+ __m128i a128, r128; -+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 -+ r128 = _mm_slli_epi32 (a128, b); //shift_res -+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only -+ return64(r128); -+} -+ -+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b) -+{ -+ int32x2_t res64; -+ return64( vqshluq_n_s32(_pM128i(a), b)); -+} -+ -+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster -+{ -+ uint64x1_t res; -+ uint64_t limit; -+ if (a.m64_i64[0]<=0) { -+ res.m64_u64[0] = 0; -+ } else { -+ limit = (uint64_t) 1 << (64 - b); -+ res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? 
res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b; -+ } -+ return res; -+} -+ -+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 -+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 -+{ -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi16 (a128, b); -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI8_EPI16 (a128); -+ r128_2 = _mm_slli_epi16 (a128, b); -+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 -+} -+ -+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 -+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 -+{ -+ // manual saturation solution looks LESS optimal than 32 bits conversion one -+ __m128i a128, r128_1, r128_2; -+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 -+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res -+ //swap hi and low part of a128 to process the remaining data -+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); -+ a128 = _MM_CVTEPI16_EPI32 (a128); -+ r128_2 = _mm_slli_epi32 (a128, b); -+ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 -+} -+ -+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 -+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; -+ zero = _mm_setzero_si128(); -+ maskA = _mm_cmpeq_epi32(a, a); -+ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros -+ //saturate negative numbers to zero -+ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) -+ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now -+ //saturate positive to 0xffffffff -+ a_masked = _mm_and_si128 (a0, maskA); -+ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise -+ a_shift = _mm_slli_epi32 (a0, b); -+ return _mm_or_si128 (a_shift, a_masked); //actual saturation -+} -+ -+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ // no effective SIMD solution here, serial execution looks faster -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ _NEON2SSE_ALIGN_16 uint64_t res[2]; -+ uint64_t limit; -+ int i; -+ _mm_store_si128((__m128i*)atmp, a); -+ for (i = 0; i<2; i++) { -+ if (atmp[i]<=0) { -+ res[i] = 0; -+ } else { -+ limit = (uint64_t) 1 << (64 - b); -+ res[i] = ( ((uint64_t)atmp[i]) >= limit) ? 
res[i] = ~((uint64_t)0) : atmp[i] << b; -+ } -+ } -+ return _mm_load_si128((__m128i*)res); -+} -+ -+//************** Vector narrowing shift right by constant ************** -+//********************************************************************** -+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r16); -+} -+ -+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r32); -+} -+ -+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ __m128i r64; -+ r64 = vshrq_n_s64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i mask, r16; -+ mask = _mm_set1_epi16(0xff); -+ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_and_si128(r16, mask); //to avoid saturation -+ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i mask, r32; -+ mask = _mm_set1_epi32(0xffff); -+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16) -+ r32 = _mm_and_si128(r32, mask); //to avoid saturation -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res64; -+ __m128i r64; -+ r64 = vshrq_n_u64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************** Vector signed->unsigned narrowing saturating shift right by constant ******** -+//********************************************************************************************* -+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t 
vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i r16; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ uint32x2_t res; -+ int64_t res64; -+ _mm_store_si128((__m128i*)atmp, a); -+ if (atmp[0] < 0) { -+ res.m64_u32[0] = 0; -+ } else { -+ res64 = (atmp[0] >> b); -+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64; -+ } -+ if (atmp[1] < 0) { -+ res.m64_u32[1] = 0; -+ } else { -+ res64 = (atmp[1] >> b); -+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64; -+ } -+ return res; -+} -+ -+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant ***** -+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i r16; -+ uint8x8_t res64; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16 -+{ -+ //solution may be not optimal compared with the serial one -+ __m128i r32; -+ uint16x4_t res64; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster -+{ -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ uint32x2_t res; -+ int64_t res64; -+ _mm_store_si128((__m128i*)atmp, a); -+ if (atmp[0] < 0) { -+ res.m64_u32[0] = 0; -+ } else { -+ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); -+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64; -+ } -+ if (atmp[1] < 0) { -+ res.m64_u32[1] = 0; -+ } else { -+ res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) ); -+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 
0xffffffff : res64; -+ } -+ return res; -+} -+ -+//***** Vector narrowing saturating shift right by constant ****** -+//***************************************************************** -+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ r16 = vshrq_n_s16(a,b); -+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_s32(a,b); -+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //no optimal SIMD solution found -+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2]; -+ int32x2_t res; -+ _mm_store_si128((__m128i*)atmp, a); -+ res64[0] = (atmp[0] >> b); -+ res64[1] = (atmp[1] >> b); -+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; -+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; -+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; -+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; -+ res.m64_i32[0] = (int32_t)res64[0]; -+ res.m64_i32[1] = (int32_t)res64[1]; -+ return res; -+} -+ -+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i r16; -+ r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ //serial solution may be faster -+ uint32x2_t res64; -+ __m128i r64, res_hi, zero; -+ zero = _mm_setzero_si128(); -+ r64 = vshrq_n_u64(a,b); -+ res_hi = _mm_srli_epi64(r64, 32); -+ res_hi = _mm_cmpgt_epi32(res_hi, zero); -+ r64 = _mm_or_si128(r64, res_hi); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+ -+//********* Vector rounding narrowing shift right by constant ************************* -+//**************************************************************************************** -+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. 
Impossible to use _mm_packs because of negative saturation problems -+ return64(r16); -+} -+ -+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems -+ return64(r32); -+} -+ -+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ __m128i r64; -+ r64 = vrshrq_n_s64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i mask, r16; -+ mask = _mm_set1_epi16(0xff); -+ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_and_si128(r16, mask); //to avoid saturation -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i mask, r32; -+ mask = _mm_set1_epi32(0xffff); -+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _mm_and_si128(r32, mask); //to avoid saturation -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster -+{ -+ uint32x2_t res64; -+ __m128i r64; -+ r64 = vrshrq_n_u64(a,b); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************* Vector rounding narrowing saturating shift right by constant ************ -+//**************************************************************************************** -+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 -+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8 -+{ -+ int8x8_t res64; -+ __m128i r16; -+ r16 = vrshrq_n_s16(a,b); -+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 -+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16 -+{ -+ int16x4_t res64; -+ __m128i r32; -+ r32 = vrshrq_n_s32(a,b); -+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+int32x2_t vqrshrn_n_s64(int64x2_t a, 
__constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //no optimal SIMD solution found -+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2]; -+ int32x2_t res; -+ _mm_store_si128((__m128i*)atmp, a); -+ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1)); -+ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result -+ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1)); -+ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result -+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; -+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; -+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; -+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; -+ res.m64_i32[0] = (int32_t)res64[0]; -+ res.m64_i32[1] = (int32_t)res64[1]; -+ return res; -+} -+ -+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8 -+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8 -+{ -+ uint8x8_t res64; -+ __m128i r16; -+ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) -+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only -+ return64(r16); -+} -+ -+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 -+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16 -+{ -+ uint16x4_t res64; -+ __m128i r32; -+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) -+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only -+ return64(r32); -+} -+ -+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 -+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) -+{ -+ //serial solution may be faster -+ uint32x2_t res64; -+ __m128i r64, res_hi, zero; -+ zero = _mm_setzero_si128(); -+ r64 = vrshrq_n_u64(a,b); -+ res_hi = _mm_srli_epi64(r64, 32); -+ res_hi = _mm_cmpgt_epi32(res_hi, zero); -+ r64 = _mm_or_si128(r64, res_hi); -+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(r64); -+} -+ -+//************** Vector widening shift left by constant **************** -+//************************************************************************ -+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 -+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 -+ return _mm_slli_epi16 (r, b); -+} -+ -+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 -+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1, -+ return _mm_slli_epi32 (r, b); -+} -+ -+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 -+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0 -+{ -+ __m128i r; -+ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1, -+ return _mm_slli_epi64 (r, b); -+} -+ -+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 -+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0 -+{ -+ //no uint8 to uint16 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi8(_pM128i(a), zero); -+ return _mm_slli_epi16 (r, b); -+} -+ -+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0 -+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, 
__constrange(0,16) int b) // VSHLL.s16 q0,d0,#0 -+{ -+ //no uint16 to uint32 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi16(_pM128i(a), zero); -+ return _mm_slli_epi32 (r, b); -+} -+ -+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 -+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0 -+{ -+ //no uint32 to uint64 conversion available, manual conversion used -+ __m128i zero, r; -+ zero = _mm_setzero_si128 (); -+ r = _mm_unpacklo_epi32(_pM128i(a), zero); -+ return _mm_slli_epi64 (r, b); -+} -+ -+//************************************************************************************ -+//**************************** Shifts with insert ************************************ -+//************************************************************************************ -+//takes each element in a vector, shifts them by an immediate value, -+//and inserts the results in the destination vector. Bits shifted out of the each element are lost. -+ -+//**************** Vector shift right and insert ************************************ -+//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. -+//All other bits are taken from b shifted. -+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) -+{ -+ int8x8_t res64; -+ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) -+{ -+ int16x4_t res64; -+ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) -+{ -+ int32x2_t res64; -+ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) -+{ -+ int64x1_t res; -+ if (c ==64) -+ res = a; -+ else{ -+ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros -+ } -+ return res; -+} -+ -+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+#define vsri_n_u8 vsri_n_s8 -+ -+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+#define vsri_n_u16 vsri_n_s16 -+ -+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 -+#define vsri_n_u32 vsri_n_s32 -+ -+ -+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 -+#define vsri_n_u64 vsri_n_s64 -+ -+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 -+#define vsri_n_p8 vsri_n_u8 -+ -+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 -+#define vsri_n_p16 vsri_n_u16 -+ -+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 
-+{ -+ __m128i maskA, a_masked; -+ uint8x16_t b_shift; -+ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used -+ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros -+ a_masked = _mm_and_si128 (a, maskA); -+ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift -+ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) -+} -+ -+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 -+{ -+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a -+ uint16x8_t b_shift; -+ uint16x8_t a_c; -+ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift -+ a_c = vshrq_n_u16( a, (16 - c)); -+ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 -+{ -+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a -+ uint32x4_t b_shift; -+ uint32x4_t a_c; -+ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift -+ a_c = vshrq_n_u32( a, (32 - c)); -+ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) -+{ -+ //serial solution may be faster -+ uint64x2_t b_shift; -+ uint64x2_t a_c; -+ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift -+ a_c = _mm_srli_epi64(a, (64 - c)); -+ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a -+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) -+} -+ -+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+#define vsriq_n_u8 vsriq_n_s8 -+ -+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+#define vsriq_n_u16 vsriq_n_s16 -+ -+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 -+#define vsriq_n_u32 vsriq_n_s32 -+ -+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 -+#define vsriq_n_u64 vsriq_n_s64 -+ -+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 -+#define vsriq_n_p8 vsriq_n_u8 -+ -+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 -+#define vsriq_n_p16 vsriq_n_u16 -+ -+//***** Vector shift left and insert ********************************************* -+//********************************************************************************* -+//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. -+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 
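/* Editor's note (not part of this patch or of the emulation header it adds): the two
   comment blocks above describe VSRI and VSLI purely in terms of which bits of "a"
   survive and which bits come from the shifted "b". The hypothetical scalar helpers
   below model one unsigned 32-bit lane of each operation; the names, ranges and guards
   are illustrative assumptions meant only as a reading aid for the SSE code around them. */
#include <stdint.h>

/* VSRI-style lane: keep the top c bits of a, take the remaining bits from b >> c (1 <= c <= 32). */
static inline uint32_t sri_lane_u32(uint32_t a, uint32_t b, unsigned c)
{
    uint32_t keep_a = (c >= 32) ? 0xffffffffu : ~(0xffffffffu >> c); /* top c ones */
    uint32_t from_b = (c >= 32) ? 0u : (b >> c);                     /* top c bits of b >> c are zero */
    return (a & keep_a) | from_b;
}

/* VSLI-style lane: keep the low c bits of a, take the remaining bits from b << c (0 <= c <= 31). */
static inline uint32_t sli_lane_u32(uint32_t a, uint32_t b, unsigned c)
{
    uint32_t keep_a = (c == 0) ? 0u : (0xffffffffu >> (32 - c));     /* low c ones */
    return (a & keep_a) | (b << c);                                  /* low c bits of b << c are zero */
}
/* End of editor's note; the patch content continues below. */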
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c) -+{ -+ int8x8_t res64; -+ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c) -+{ -+ int16x4_t res64; -+ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c)); -+} -+ -+ -+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c) -+{ -+ int32x2_t res64; -+ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c)); -+} -+ -+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros -+ return res; -+} -+ -+ -+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+#define vsli_n_u8 vsli_n_s8 -+ -+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+#define vsli_n_u16 vsli_n_s16 -+ -+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 -+#define vsli_n_u32 vsli_n_s32 -+ -+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 -+#define vsli_n_u64 vsli_n_s64 -+ -+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 -+#define vsli_n_p8 vsli_n_u8 -+ -+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 -+#define vsli_n_p16 vsli_n_u16 -+ -+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 -+{ -+ __m128i maskA, a_masked; -+ int8x16_t b_shift; -+ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask -+ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones -+ b_shift = vshlq_n_s8( b, c); -+ a_masked = _mm_and_si128 (a, maskA); -+ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) -+} -+ -+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 -+{ -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a -+ int16x8_t b_shift; -+ int16x8_t a_c; -+ b_shift = vshlq_n_s16( b, c); -+ a_c = vshlq_n_s16( a, (16 - c)); -+ a_c = _mm_srli_epi16(a_c, (16 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a -+ int32x4_t b_shift; -+ int32x4_t a_c; -+ b_shift = vshlq_n_s32( b, c); 
-+ a_c = vshlq_n_s32( a, (32 - c)); -+ a_c = _mm_srli_epi32(a_c, (32 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 -+{ -+ //solution may be not optimal compared with the serial one -+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a -+ int64x2_t b_shift; -+ int64x2_t a_c; -+ b_shift = vshlq_n_s64( b, c); -+ a_c = vshlq_n_s64( a, (64 - c)); -+ a_c = _mm_srli_epi64(a_c, (64 - c)); -+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) -+} -+ -+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+#define vsliq_n_u8 vsliq_n_s8 -+ -+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+#define vsliq_n_u16 vsliq_n_s16 -+ -+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 -+#define vsliq_n_u32 vsliq_n_s32 -+ -+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 -+#define vsliq_n_u64 vsliq_n_s64 -+ -+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 -+#define vsliq_n_p8 vsliq_n_u8 -+ -+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 -+#define vsliq_n_p16 vsliq_n_u16 -+ -+// *********************************************************************************************** -+// ****************** Loads and stores of a single vector *************************************** -+// *********************************************************************************************** -+//Performs loads and stores of a single vector of some type. -+//******************************* Loads ******************************************************** -+// *********************************************************************************************** -+//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. -+//also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. -+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access -+//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; -+#define LOAD_SI128(ptr) \ -+ ( ((unsigned long)(ptr) & 15) == 0 ) ? 
_mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)); -+ -+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_u8 LOAD_SI128 -+ -+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_u16 LOAD_SI128 -+ -+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+#define vld1q_u32 LOAD_SI128 -+ -+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld1q_u64 LOAD_SI128 -+ -+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_s8 LOAD_SI128 -+ -+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_s16 LOAD_SI128 -+ -+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+#define vld1q_s32 LOAD_SI128 -+ -+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld1q_s64 LOAD_SI128 -+ -+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers -+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] -+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); -+__m128 f2; -+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); -+}*/ -+ -+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) -+{ -+ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned -+ return _mm_load_ps(ptr); -+ else -+ return _mm_loadu_ps(ptr); -+} -+ -+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] -+#define vld1q_p8 LOAD_SI128 -+ -+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] -+#define vld1q_p16 LOAD_SI128 -+ -+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr)) -+ -+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_u16 vld1_u8 -+ -+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] -+#define vld1_u32 vld1_u8 -+ -+ -+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_u64 vld1_u8 -+ -+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_s8 vld1_u8 -+ -+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_s16 vld1_u16 -+ -+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] -+#define vld1_s32 vld1_u32 -+ -+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_s64 vld1_u64 -+ -+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); -+ -+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = *(ptr); -+ res.m64_f32[1] = *(ptr + 1); -+ return res; -+} -+ -+poly8x8_t vld1_p8(__transfersize(8) 
poly8_t const * ptr); // VLD1.8 {d0}, [r0] -+#define vld1_p8 vld1_u8 -+ -+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] -+#define vld1_p16 vld1_u16 -+ -+//*********************************************************************************************************** -+//******* Lane load functions - insert the data at vector's given position (lane) ************************* -+//*********************************************************************************************************** -+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) -+ -+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; -+ -+ -+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) -+ -+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) -+{ -+ //we need to deal with ptr 16bit NOT aligned case -+ __m128 p; -+ p = _mm_set1_ps(*(ptr)); -+ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); -+} -+ -+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] -+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) -+ -+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) -+ -+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) -+ -+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int 
lane) -+{ -+ uint8x8_t res; -+ res = vec; -+ res.m64_u8[lane] = *(ptr); -+ return res; -+} -+ -+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16x4_t res; -+ res = vec; -+ res.m64_u16[lane] = *(ptr); -+ return res; -+} -+ -+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32x2_t res; -+ res = vec; -+ res.m64_u32[lane] = *(ptr); -+ return res; -+} -+ -+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = *(ptr); -+ return res; -+} -+ -+ -+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane) -+ -+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane) -+ -+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane) -+ -+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32x2_t res; -+ res = vec; -+ res.m64_f32[lane] = *(ptr); -+ return res; -+} -+ -+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] -+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane) -+ -+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] -+#define vld1_lane_p8 vld1_lane_u8 -+ -+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] -+#define vld1_lane_p16 vld1_lane_s16 -+ -+// ****************** Load single value ( set all lanes of vector with same value from memory)********************** -+// ****************************************************************************************************************** -+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr)) -+ -+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr)) -+ -+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_u32(ptr) 
_mm_set1_epi32(*(ptr)) -+ -+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)}; -+ return LOAD_SI128(val); -+} -+ -+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr)) -+ -+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr)) -+ -+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr)) -+ -+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr) -+ -+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+//current IA SIMD doesn't support float16, need to go to 32 bits -+ -+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr)) -+ -+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr)) -+ -+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr)) -+ -+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint8x8_t res; -+ int i; -+ for(i = 0; i<8; i++) { -+ res.m64_u8[i] = *(ptr); -+ } -+ return res; -+} -+ -+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint16x4_t res; -+ int i; -+ for(i = 0; i<4; i++) { -+ res.m64_u16[i] = *(ptr); -+ } -+ return res; -+} -+ -+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = *(ptr); -+ res.m64_u32[1] = *(ptr); -+ return res; -+} -+ -+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] -+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = *(ptr); -+ return res; -+} -+ -+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr) -+ -+ -+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr) -+ -+ -+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] -+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr) -+ -+ -+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] -+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr) -+ -+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // 
VLD1.32 {d0[]}, [r0] -+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = *(ptr); -+ res.m64_f32[1] = res.m64_f32[0]; -+ return res; // use last 64bits only -+} -+ -+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] -+#define vld1_dup_p8 vld1_dup_u8 -+ -+ -+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] -+#define vld1_dup_p16 vld1_dup_u16 -+ -+ -+//************************************************************************************* -+//********************************* Store ********************************************** -+//************************************************************************************* -+// If ptr is 16bit aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val); -+//here we assume the case of NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro -+#define STORE_SI128(ptr, val) \ -+ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); -+ -+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_u8 STORE_SI128 -+ -+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_u16 STORE_SI128 -+ -+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] -+#define vst1q_u32 STORE_SI128 -+ -+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] -+#define vst1q_u64 STORE_SI128 -+ -+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_s8 STORE_SI128 -+ -+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_s16 STORE_SI128 -+ -+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] -+#define vst1q_s32 STORE_SI128 -+ -+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] -+#define vst1q_s64 STORE_SI128 -+ -+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) -+{ -+ if( ((unsigned long)(ptr) & 15) == 0 ) //16 bits aligned -+ _mm_store_ps (ptr, val); -+ else -+ _mm_storeu_ps (ptr, val); -+} -+ -+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] -+#define vst1q_p8 vst1q_u8 -+ -+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] -+#define vst1q_p16 vst1q_u16 -+ -+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val) -+{ -+ int i; -+ for (i = 0; i<8; i++) { -+ *(ptr + i) = ((uint8_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val) -+{ -+ int i; -+ for (i = 0; i<4; i++) { -+ *(ptr + i) = ((uint16_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, 
val); -+ return; -+} -+ -+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val) -+{ -+ int i; -+ for (i = 0; i<2; i++) { -+ *(ptr + i) = ((uint32_t*)&val)[i]; -+ } -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val) -+{ -+ *(ptr) = *((uint64_t*)&val); -+ //_mm_storel_epi64((__m128i*)ptr, val); -+ return; -+} -+ -+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] -+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val) -+ -+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] -+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val) -+ -+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] -+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val) -+ -+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] -+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val) -+ -+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val) -+{ -+ *(ptr) = val.m64_f32[0]; -+ *(ptr + 1) = val.m64_f32[1]; -+ return; -+} -+ -+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] -+#define vst1_p8 vst1_u8 -+ -+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] -+#define vst1_p16 vst1_u16 -+ -+//***********Store a lane of a vector into memory (extract given lane) ********************* -+//****************************************************************************************** -+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) -+ -+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) -+ -+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) -+ -+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] -+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) -+ -+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) -+ -+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) -+ -+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) -+ -+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); 
// VST1.64 {d0}, [r0] -+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) -+ -+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) -+{ -+ int32_t ilane; -+ ilane = _MM_EXTRACT_PS(val,lane); -+ *(ptr) = *((float*)&ilane); -+} -+ -+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1q_lane_p8 vst1q_lane_u8 -+ -+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1q_lane_p16 vst1q_lane_s16 -+ -+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val.m64_u8[lane]; -+} -+ -+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val.m64_u16[lane]; -+} -+ -+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val.m64_u32[lane]; -+} -+ -+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] -+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane) -+{ -+ *(ptr) = val.m64_u64[0]; -+} -+ -+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane) -+ -+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane) -+ -+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane) -+ -+ -+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] -+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane) -+ -+ -+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] -+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val.m64_f32[lane]; -+} -+ -+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] -+#define vst1_lane_p8 vst1_lane_u8 -+ -+void vst1_lane_p16(__transfersize(1) 
poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] -+#define vst1_lane_p16 vst1_lane_s16 -+ -+//*********************************************************************************************** -+//**************** Loads and stores of an N-element structure ********************************** -+//*********************************************************************************************** -+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning -+//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" -+//****************** 2 elements load ********************************************* -+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] -+{ -+ uint8x16x2_t v; -+ v.val[0] = vld1q_u8(ptr); -+ v.val[1] = vld1q_u8((ptr + 16)); -+ v = vuzpq_s8(v.val[0], v.val[1]); -+ return v; -+} -+ -+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] -+{ -+ uint16x8x2_t v; -+ v.val[0] = vld1q_u16( ptr); -+ v.val[1] = vld1q_u16( (ptr + 8)); -+ v = vuzpq_s16(v.val[0], v.val[1]); -+ return v; -+} -+ -+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] -+{ -+ uint32x4x2_t v; -+ v.val[0] = vld1q_u32 ( ptr); -+ v.val[1] = vld1q_u32 ( (ptr + 4)); -+ v = vuzpq_s32(v.val[0], v.val[1]); -+ return v; -+} -+ -+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); -+#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) -+ -+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) -+ -+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) -+ -+ -+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example -+ -+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] -+{ -+ float32x4x2_t v; -+ v.val[0] = vld1q_f32 (ptr); -+ v.val[1] = vld1q_f32 ((ptr + 4)); -+ v = vuzpq_f32(v.val[0], v.val[1]); -+ return v; -+} -+ -+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] -+#define vld2q_p8 vld2q_u8 -+ -+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] -+#define vld2q_p16 vld2q_u16 -+ -+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) -+{ -+ uint8x8x2_t v; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; -+ __m128i ld128; -+ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd); -+ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); -+ return v; -+} -+ -+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint16x4x2_t v; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ __m128i ld128; -+ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd); -+ vst1q_u16((v.val), ld128); -+ return v; -+} -+ -+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) -+{ -+ _NEON2SSE_ALIGN_16 uint32x2x2_t v; -+ __m128i ld128; -+ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit -+ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); -+ vst1q_u32((v.val), ld128); -+ return v; -+} -+ -+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) -+{ -+ uint64x1x2_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ return v; -+} -+ -+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) -+ -+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) -+ -+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) -+ -+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) -+ -+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1_f16 for example -+ -+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) -+{ -+ float32x2x2_t v; -+ v.val[0].m64_f32[0] = *(ptr); -+ v.val[0].m64_f32[1] = *(ptr + 2); -+ v.val[1].m64_f32[0] = *(ptr + 1); -+ v.val[1].m64_f32[1] = *(ptr + 3); -+ return v; -+} -+ -+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] -+#define vld2_p8 vld2_u8 -+ -+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] -+#define vld2_p16 vld2_u16 -+ -+//******************** Triplets *************************************** -+//********************************************************************* -+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> -+ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 -+ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, -+ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 -+ uint8x16x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; -+ -+ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 -+ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 -+ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 -+ -+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 -+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 -+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 -+ -+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 -+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x -+ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, -+ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 -+ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, -+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, -+ -+ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, -+ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 -+ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 -+ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, -+ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, -+ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 -+ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 -+ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, -+ -+ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, -+ v.val[2] = _mm_srli_si128(tmp1,11); 
//b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 -+ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, -+ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, -+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, -+ return v; -+} -+ -+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] -+{ -+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 -+ uint16x8x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; -+ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; -+ -+ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, -+ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 -+ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 -+ -+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, -+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 -+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 -+ -+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, -+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x -+ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 -+ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 -+ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, -+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5 -+ -+ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7 -+ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 -+ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 -+ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6, -+ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5, -+ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0, -+ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, -+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, -+ -+ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 -+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, -+ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 -+ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7, -+ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 -+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7, -+ return v; -+} -+ -+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ uint32x4x3_t v; -+ __m128i tmp0, tmp1,tmp2, tmp3; -+ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, -+ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 -+ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, -+ -+ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); 
//a0,a3,a1,a2 -+ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 -+ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 -+ -+ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 -+ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 -+ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 -+ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, -+ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 -+ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 -+ return v; -+} -+ -+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) -+ -+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) -+ -+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) -+ -+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ float32x4x3_t v; -+ __m128 tmp0, tmp1,tmp2, tmp3; -+ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, -+ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 -+ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, -+ -+ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 -+ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 -+ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 -+ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 -+ -+ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 -+ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 -+ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, -+ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 -+ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 -+ return v; -+} -+ -+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] -+#define vld3q_p8 vld3q_u8 -+ -+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] -+#define vld3q_p16 vld3q_u16 -+ -+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0] -+{ -+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 -+ uint8x8x3_t v; -+ __m128i val0, val1, val2, tmp0, tmp1; -+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; -+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; -+ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 -+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7 -+ -+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); 
//a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, -+ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x -+ val0 = _mm_slli_si128(tmp0,10); -+ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 -+ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x -+ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x -+ _M64(v.val[0], val0); -+ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, -+ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, -+ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 -+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 -+ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x -+ _M64(v.val[1], val1); -+ -+ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, -+ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 -+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7, -+ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, -+ uint16x4x3_t v; -+ __m128i val0, val1, val2, tmp0, tmp1; -+ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; -+ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 -+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x -+ -+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 -+ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3, -+ val0 = _mm_slli_si128(tmp0,10); -+ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0, -+ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1 -+ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0 -+ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x -+ _M64(v.val[0], val0); -+ -+ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 -+ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0, -+ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0, -+ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0 -+ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x -+ _M64(v.val[1], val1); -+ -+ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 -+ tmp1 = _mm_srli_si128(tmp1,4); -+ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, -+ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] -+{ -+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 -+ uint32x2x3_t v; -+ __m128i val0, val1, val2; -+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, -+ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x -+ -+ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 -+ _M64(v.val[0], val0); -+ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1, -+ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1 -+ _M64(v.val[1], val1); -+ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x, -+ _M64(v.val[2], val2); -+ return v; -+} -+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // 
VLD1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] -+{ -+ uint64x1x3_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ return v; -+} -+ -+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) -+ -+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) -+ -+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) -+ -+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) -+ -+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) -+{ -+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 -+ float32x2x3_t v; -+ v.val[0].m64_f32[0] = *(ptr); -+ v.val[0].m64_f32[1] = *(ptr + 3); -+ -+ v.val[1].m64_f32[0] = *(ptr + 1); -+ v.val[1].m64_f32[1] = *(ptr + 4); -+ -+ v.val[2].m64_f32[0] = *(ptr + 2); -+ v.val[2].m64_f32[1] = *(ptr + 5); -+ return v; -+} -+ -+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] -+#define vld3_p8 vld3_u8 -+ -+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] -+#define vld3_p16 vld3_u16 -+ -+//*************** Quadruples load ******************************** -+//***************************************************************** -+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] -+{ -+ uint8x16x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ -+ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 -+ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15 -+ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15 -+ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 -+ -+ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 -+ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 -+ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 -+ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 -+ -+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 -+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 -+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 -+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 -+ -+ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 -+ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 -+ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, -+ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 -+ -+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 -+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 -+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 -+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 -+ return v; -+} -+ -+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] -+{ -+ uint16x8x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 -+ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 -+ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 -+ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 -+ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, -+ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, -+ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7 -+ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7 -+ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5 -+ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 -+ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 -+ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7 -+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, -+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 -+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, -+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 -+ return v; -+} -+ -+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] -+{ -+ uint32x4x4_t v; -+ __m128i tmp3, tmp2, tmp1, tmp0; -+ v.val[0] = vld1q_u32 (ptr); -+ v.val[1] = vld1q_u32 ((ptr + 4)); -+ v.val[2] = vld1q_u32 ((ptr + 8)); -+ v.val[3] = vld1q_u32 ((ptr + 12)); -+ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); -+ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]); -+ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); -+ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); -+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); -+ v.val[1] = 
_mm_unpackhi_epi64(tmp0, tmp1); -+ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); -+ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); -+ return v; -+} -+ -+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) -+ -+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) -+ -+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) -+ -+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] -+{ -+ float32x4x4_t v; -+ __m128 tmp3, tmp2, tmp1, tmp0; -+ -+ v.val[0] = vld1q_f32 ((float*) ptr); -+ v.val[1] = vld1q_f32 ((float*) (ptr + 4)); -+ v.val[2] = vld1q_f32 ((float*) (ptr + 8)); -+ v.val[3] = vld1q_f32 ((float*) (ptr + 12)); -+ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); -+ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); -+ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); -+ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); -+ v.val[0] = _mm_movelh_ps(tmp0, tmp2); -+ v.val[1] = _mm_movehl_ps(tmp2, tmp0); -+ v.val[2] = _mm_movelh_ps(tmp1, tmp3); -+ v.val[3] = _mm_movehl_ps(tmp3, tmp1); -+ return v; -+} -+ -+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] -+#define vld4q_p8 vld4q_u8 -+ -+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] -+#define vld4q_p16 vld4q_s16 -+ -+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] -+{ -+ uint8x8x4_t v; -+ __m128i sh0, sh1; -+ __m128i val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; -+ -+ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] -+ val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] -+ -+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8); -+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 -+ vst1q_u8(&v.val[0], val0 ); -+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 -+ vst1q_u8(&v.val[2], val2 ); -+ return v; -+} -+ -+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] -+{ -+ uint16x4x4_t v; -+ __m128i sh0, sh1; -+ __m128i val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 -+ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] -+ val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] -+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16); -+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 
1,5,9,13 -+ vst1q_u16(&v.val[0], val0 ); -+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 -+ vst1q_u16(&v.val[2], val2 ); -+ return v; -+} -+ -+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) -+{ -+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 -+ uint32x2x4_t v; -+ __m128i val0, val01, val2; -+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, -+ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 -+ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1, -+ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1 -+ vst1q_u32(&v.val[0], val01); -+ vst1q_u32(&v.val[2], val2 ); -+ return v; -+} -+ -+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] -+{ -+ uint64x1x4_t v; -+ v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1] -+ v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1] -+ v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3] -+ v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3] -+ return v; -+} -+ -+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) -+ -+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) -+ -+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) -+ -+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) -+ -+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example -+ -+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] -+{ -+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 -+ float32x2x4_t res; -+ res.val[0].m64_f32[0] = *(ptr); -+ res.val[0].m64_f32[1] = *(ptr + 4); -+ res.val[1].m64_f32[0] = *(ptr + 1); -+ res.val[1].m64_f32[1] = *(ptr + 5); -+ res.val[2].m64_f32[0] = *(ptr + 2); -+ res.val[2].m64_f32[1] = *(ptr + 6); -+ res.val[3].m64_f32[0] = *(ptr + 3); -+ res.val[3].m64_f32[1] = *(ptr + 7); -+ return res; -+} -+ -+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] -+#define vld4_p8 vld4_u8 -+ -+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] -+#define vld4_p16 vld4_u16 -+ -+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* -+//******************************************************************************************************************* -+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] -+{ -+ uint8x8x2_t v; -+ __m128i val0, val1; -+ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ vst1q_u8(v.val, val0); -+ return v; -+} -+ -+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] -+{ -+ uint16x4x2_t v; -+ __m128i val0, val1; -+ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0) -+ _M64(v.val[0], val0); -+ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1) -+ _M64(v.val[1], val1); -+ return v; -+} -+ -+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] -+{ -+ uint32x2x2_t v; -+ __m128i val0; -+ val0 = LOAD_SI128(ptr); //0,1,x,x -+ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 -+ vst1q_u32(v.val, val0); -+ return v; -+} -+ -+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_dup_u64 vld2_u64 -+ -+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) -+ -+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) -+ -+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr) -+ -+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] -+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) -+ -+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+// IA32 SIMD doesn't work with 
16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] -+{ -+ float32x2x2_t v; -+ v.val[0].m64_f32[0] = *(ptr); //0,0 -+ v.val[0].m64_f32[1] = *(ptr); //0,0 -+ v.val[1].m64_f32[0] = *(ptr + 1); //1,1 -+ v.val[1].m64_f32[1] = *(ptr + 1); //1,1 -+ return v; -+} -+ -+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] -+#define vld2_dup_p8 vld2_dup_u8 -+ -+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] -+#define vld2_dup_p16 vld2_dup_s16 -+ -+//************* Duplicate (or propagate)triplets: ******************* -+//******************************************************************** -+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes -+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] -+{ -+ uint8x8x3_t v; -+ __m128i val0, val1, val2; -+ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, -+ vst1q_u8(v.val, val0); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] -+{ -+ uint16x4x3_t v; -+ __m128i val0, val1, val2; -+ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0) -+ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1) -+ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2) -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] -+{ -+ uint32x2x3_t v; -+ __m128i val0, val1, val2; -+ val2 = LOAD_SI128(ptr); //0,1,2,x -+ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 -+ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 -+ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0 -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ return v; -+} -+ -+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] -+{ -+ uint64x1x3_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ return v; -+} -+ -+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) -+ -+int16x4x3_t vld3_dup_s16(__transfersize(3) 
int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) -+ -+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) -+ -+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] -+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) -+ -+ -+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] -+{ -+ float32x2x3_t v; -+ int i; -+ for (i = 0; i<3; i++) { -+ v.val[i].m64_f32[0] = *(ptr + i); -+ v.val[i].m64_f32[1] = *(ptr + i); -+ } -+ return v; -+} -+ -+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_p8 vld3_dup_u8 -+ -+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] -+#define vld3_dup_p16 vld3_dup_s16 -+ -+ -+//************* Duplicate (or propagate) quadruples: ******************* -+//*********************************************************************** -+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes -+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint8x8x4_t v; -+ __m128i val0, val1, val2; -+ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x -+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, -+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 -+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, -+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 -+ vst1q_u8(&v.val[0], val0); -+ vst1q_u8(&v.val[2], val2); -+ return v; -+} -+ -+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint16x4x4_t v; -+ __m128i val0, val1, val2, val3; -+ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x -+ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0) -+ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1) -+ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2) -+ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3) -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ _M64(v.val[3], val3); -+ return v; -+} -+ -+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ uint32x2x4_t v; -+ __m128i val0, val1, val2, val3; -+ val3 = LOAD_SI128(ptr); //0,1,2,3 -+ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 -+ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 
4) | (3 << 6)); //1,1,2,3 -+ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 -+ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 -+ _M64(v.val[0], val0); -+ _M64(v.val[1], val1); -+ _M64(v.val[2], val2); -+ _M64(v.val[3], val3); -+ return v; -+} -+ -+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] -+{ -+ uint64x1x4_t v; -+ v.val[0].m64_u64[0] = *(ptr); -+ v.val[1].m64_u64[0] = *(ptr + 1); -+ v.val[2].m64_u64[0] = *(ptr + 2); -+ v.val[3].m64_u64[0] = *(ptr + 3); -+ return v; -+} -+ -+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) -+ -+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) -+ -+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) -+ -+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] -+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) -+ -+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] -+{ -+ float32x2x4_t v; -+ int i; -+ for (i = 0; i<4; i++) { -+ v.val[i].m64_f32[0] = *(ptr + i); -+ v.val[i].m64_f32[1] = *(ptr + i); -+ } -+ return v; -+} -+ -+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_p8 vld4_dup_u8 -+ -+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] -+#define vld4_dup_p16 vld4_dup_u16 -+ -+ -+//********************************************************************************** -+//*******************Lane loads for an N-element structures *********************** -+//********************************************************************************** -+//********************** Lane pairs ************************************************ -+//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon -+//we assume src is 16 bit aligned -+ -+//!!!!!! 
Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error -+//to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined -+ -+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0] -+{ -+ uint16x8x2_t v; -+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] -+{ -+ uint32x4x2_t v; -+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); -+ return v; -+} -+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane) -+{ -+ int16x8x2_t v; -+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane) -+ -+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) -+{ -+ int32x4x2_t v; -+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); -+ return v; -+} -+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) -+ -+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] -+{ -+ float32x4x2_t v; -+ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane) -+ -+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] -+#define vld2q_lane_p16 vld2q_lane_u16 -+ -+//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x2_t 
vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] -+{ -+ uint8x8x2_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane) -+{ -+ uint16x4x2_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane) -+{ -+ uint32x2x2_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane) -+ -+//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) -+ -+//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) -+ -+//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] -+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane) -+ -+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane) -+{ -+ float32x2x2_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ return v; -+} -+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane) -+ -+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] -+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] -+#define 
vld2_lane_p8 vld2_lane_u8 -+ -+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] -+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] -+#define vld2_lane_p16 vld2_lane_u16 -+ -+//*********** Lane triplets ********************** -+//************************************************* -+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon -+//we assume src is 16 bit aligned -+ -+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ uint16x8x3_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ uint32x4x3_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ int16x8x3_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane) -+ -+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ int32x4x3_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ return v; -+} -+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane) -+ -+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+//current IA SIMD doesn't support float16 -+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane) -+ -+ -+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) 
int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] -+{ -+ float32x4x3_t v; -+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); -+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); -+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); -+ return v; -+} -+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane) -+ -+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] -+#define vld3q_lane_p16 vld3q_lane_u16 -+ -+//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ uint8x8x3_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ uint16x4x3_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ //need to merge into 128 bit anyway -+ uint32x2x3_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);; -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);; -+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);; -+ return v; -+} -+#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane) -+ -+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane) -+ -+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane) -+ -+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane) -+ -+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+//current IA SIMD doesn't support float16 
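// A minimal usage sketch for the lane-triplet loads above (an illustrative aside,
// not taken from the NEON_2_SSE patch): it de-interleaves four uint16 r,g,b
// triplets into planar registers, one triplet per lane, using the vld3_lane_u16
// emulation defined above. The helper name and the r0,g0,b0,r1,g1,b1,... buffer
// layout are hypothetical, and it assumes the vdup_n_u16 and vst1_u16 emulations
// provided elsewhere in this header. The lane argument must stay a compile-time
// constant (__constrange), which is why the four calls are unrolled.
static void example_deinterleave_rgb16(const uint16_t * rgb, uint16_t r[4], uint16_t g[4], uint16_t b[4])
{
    uint16x4x3_t planes;
    planes.val[0] = vdup_n_u16(0); // start from zeroed planes
    planes.val[1] = vdup_n_u16(0);
    planes.val[2] = vdup_n_u16(0);
    planes = vld3_lane_u16(rgb + 0, planes, 0); // r0,g0,b0 -> lane 0 of each plane
    planes = vld3_lane_u16(rgb + 3, planes, 1); // r1,g1,b1 -> lane 1
    planes = vld3_lane_u16(rgb + 6, planes, 2); // r2,g2,b2 -> lane 2
    planes = vld3_lane_u16(rgb + 9, planes, 3); // r3,g3,b3 -> lane 3
    vst1_u16(r, planes.val[0]); // red plane
    vst1_u16(g, planes.val[1]); // green plane
    vst1_u16(b, planes.val[2]); // blue plane
}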
-+ -+//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] -+{ -+ float32x2x3_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); -+ return v; -+} -+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane) -+ -+//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_p8 vld3_lane_u8 -+ -+//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] -+#define vld3_lane_p16 vld3_lane_u16 -+ -+//******************* Lane Quadruples load *************************** -+//********************************************************************* -+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon -+//we assume src is 16 bit aligned -+ -+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane) -+{ -+ uint16x8x4_t v; -+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); -+ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane); -+ return v; -+} -+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane) -+{ -+ uint32x4x4_t v; -+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); -+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); -+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); -+ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane); -+ return v; -+} -+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane) -+ -+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane) -+ -+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane) -+ -+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 
const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) -+{ -+ float32x4x4_t v; -+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); -+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); -+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); -+ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); -+ return v; -+} -+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane) -+ -+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+#define vld4q_lane_p16 vld4q_lane_u16 -+ -+//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane) -+{ -+ uint8x8x4_t v; -+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane) -+ -+//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane) -+{ -+ uint16x4x4_t v; -+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane) -+ -+//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane) -+{ -+ uint32x2x4_t v; -+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane) -+ -+//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); -+#define 
vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane) -+ -+//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); -+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane) -+ -+//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); -+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane) -+ -+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane) -+{ -+ //serial solution may be faster -+ float32x2x4_t v; -+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane); -+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane); -+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane); -+ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane); -+ return v; -+} -+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane) -+ -+//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); -+#define vld4_lane_p8 vld4_lane_u8 -+ -+//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); -+#define vld4_lane_p16 vld4_lane_u16 -+ -+//******************* Store duplets ********************************************* -+//******************************************************************************** -+//here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function -+//If necessary you need to modify all store functions accordingly. 
See more comments to "Store single" functions -+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) -+{ -+ uint8x16x2_t v; -+ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); -+ vst1q_u8 (ptr, v.val[0]); -+ vst1q_u8 ((ptr + 16), v.val[1]); -+} -+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) -+ -+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) -+{ -+ uint16x8x2_t v; -+ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); -+ vst1q_u16 (ptr, v.val[0]); -+ vst1q_u16 ((ptr + 8), v.val[1]); -+} -+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) -+ -+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) -+{ -+ uint32x4x2_t v; -+ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); -+ vst1q_u32 (ptr, v.val[0]); -+ vst1q_u32 ((ptr + 4), v.val[1]); -+} -+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) -+ -+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] -+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); -+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) -+ -+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); -+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) -+ -+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] -+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); -+#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) -+ -+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] -+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) -+{ -+ float32x4x2_t v; -+ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); -+ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); -+ vst1q_f32 (ptr, v.val[0]); -+ vst1q_f32 ((ptr + 4), v.val[1]); -+} -+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) -+ -+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] -+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); -+#define vst2q_p8 vst2q_u8 -+ -+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0] -+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); -+#define vst2q_p16 vst2q_u16 -+ -+//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u8 (ptr, v0); -+} -+#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, 
&val) -+ -+//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u16 (ptr, v0); -+} -+#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val) -+ -+//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val) -+{ -+ __m128i v0; -+ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ vst1q_u32 (ptr, v0); -+} -+#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val) -+ -+ -+//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] -+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); -+_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+} -+#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val) -+ -+//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] -+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) -+ -+//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] -+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) -+ -+//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] -+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) -+ -+//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); -+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) -+ -+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] -+//current IA SIMD doesn't support float16 -+ -+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0] -+_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val) -+{ -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[0].m64_f32[1]; -+ *(ptr + 3) = val->val[1].m64_f32[1]; -+} -+#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val) -+ -+//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] -+#define vst2_p8 vst2_u8 -+ -+//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] -+#define vst2_p16 vst2_u16 -+ -+//******************** Triplets store ***************************************** -+//****************************************************************************** -+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) -+{ -+ uint8x16x3_t v; -+ __m128i v0,v1,v2, cff, bldmask; -+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; -+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t 
mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; -+ -+ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 -+ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 -+ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 -+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding -+ cff = _mm_cmpeq_epi8(v0, v0); //all ff -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); -+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); -+ vst1q_u8(ptr, v.val[0]); -+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); -+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); -+ vst1q_u8((ptr + 16), v.val[1]); -+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); -+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); -+ vst1q_u8((ptr + 32), v.val[2]); -+} -+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) -+ -+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) -+{ -+ uint16x8x3_t v; -+ __m128i v0,v1,v2, cff, bldmask; -+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; -+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; -+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; -+ -+ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 -+ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, -+ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 -+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding -+ cff = _mm_cmpeq_epi16(v0, v0); //all ff -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); -+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); -+ vst1q_u16(ptr, v.val[0]); -+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); -+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); -+ vst1q_u16((ptr + 8), v.val[1]); -+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] 
data embedding -+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding -+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); -+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); -+ vst1q_u16((ptr + 16), v.val[2]); -+} -+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) -+ -+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) -+{ -+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 -+ uint32x4x3_t v; -+ __m128i tmp0, tmp1,tmp2; -+ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 -+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 -+ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 -+ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, -+ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 -+ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 -+ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 -+ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, -+ -+ vst1q_u32(ptr, v.val[0]); -+ vst1q_u32((ptr + 4), v.val[1]); -+ vst1q_u32((ptr + 8), v.val[2]); -+} -+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) -+ -+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); -+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); -+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) -+ -+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); -+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); -+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) -+ -+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); -+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); -+#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) -+ -+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] -+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] -+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) -+{ -+ float32x4x3_t v; -+ __m128 tmp0, tmp1,tmp2; -+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 -+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 -+ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 -+ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, -+ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 -+ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 -+ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 -+ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, -+ -+ vst1q_f32( ptr, v.val[0]); -+ vst1q_f32( (ptr + 4), v.val[1]); -+ vst1q_f32( (ptr + 8), v.val[2]); -+} -+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) -+ -+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] -+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); -+#define 
vst3q_p8 vst3q_u8 -+ -+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] -+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); -+#define vst3q_p16 vst3q_u16 -+ -+//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val) -+{ -+ __m128i tmp, sh0, sh1, val0, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; -+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; -+ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; -+ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; -+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) ); -+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) -+ val2 = _pM128i(val->val[2]); -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); -+ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); -+ vst1q_u8(ptr, val0); //store as 128 bit structure -+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) -+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); -+ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); -+ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory -+} -+#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val) -+ -+//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val) -+{ -+ __m128i tmp, val0, val1, val2; -+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; -+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; -+ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] -+ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] -+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1])); -+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); -+ val2 = _pM128i(val->val[2]); -+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); -+ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f); -+ vst1q_u16(ptr, val0); //store as 128 bit structure -+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); -+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); -+ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order -+ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory -+} -+#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val) -+ -+//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val) -+{ -+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -+ __m128i val0, val1; -+ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5 -+ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 -+ val1 = _mm_srli_si128(val0, 8); //4,5, x,x -+ _M64((*(__m64_128*)(ptr + 4)), val1); -+ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2 -+ val0 = 
_mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 -+ vst1q_u32(ptr, val0); //store as 128 bit structure -+} -+#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val) -+ -+//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+ *(ptr + 2) = val->val[2].m64_u64[0]; -+} -+#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val) -+ -+//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0] -+#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val) -+ -+//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0] -+#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val) -+ -+//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] -+#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val) -+ -+//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0] -+#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val) -+ -+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] -+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] -+_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val) -+{ -+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5 -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[2].m64_f32[0]; -+ *(ptr + 3) = val->val[0].m64_f32[1]; -+ *(ptr + 4) = val->val[1].m64_f32[1]; -+ *(ptr + 5) = val->val[2].m64_f32[1]; -+} -+#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val) -+ -+//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] -+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); -+#define vst3_p8 vst3_u8 -+ -+//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] -+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); -+#define vst3_p16 vst3_s16 -+ -+//*************** Quadruples store ******************************** -+//********************************************************************* -+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) -+{ -+ __m128i tmp1, tmp2, res; -+ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 -+ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 -+ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 -+ vst1q_u8(ptr, res); -+ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 -+ vst1q_u8((ptr + 16), res); -+ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // -+ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // -+ res = _mm_unpacklo_epi16(tmp1, tmp2); // -+ vst1q_u8((ptr + 32), 
res); -+ res = _mm_unpackhi_epi16(tmp1, tmp2); // -+ vst1q_u8((ptr + 48), res); -+} -+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) -+ -+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) -+{ -+ uint16x8x4_t v; -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); -+ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); -+ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); -+ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); -+ vst1q_u16(ptr, v.val[0]); -+ vst1q_u16((ptr + 8), v.val[1]); -+ vst1q_u16((ptr + 16),v.val[2]); -+ vst1q_u16((ptr + 24), v.val[3]); -+} -+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val) -+ -+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) -+{ -+ uint16x8x4_t v; -+ __m128i tmp1, tmp2; -+ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); -+ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); -+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 -+ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 -+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); -+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); -+ vst1q_u32(ptr, v.val[0]); -+ vst1q_u32((ptr + 4), v.val[1]); -+ vst1q_u32((ptr + 8), v.val[2]); -+ vst1q_u32((ptr + 12), v.val[3]); -+} -+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) -+ -+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); -+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); -+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) -+ -+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); -+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); -+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) -+ -+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); -+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); -+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) -+ -+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] -+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) -+{ -+ __m128 tmp3, tmp2, tmp1, tmp0; -+ float32x4x4_t v; -+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); -+ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); -+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); -+ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); -+ v.val[0] = _mm_movelh_ps(tmp0, tmp2); -+ v.val[1] = _mm_movehl_ps(tmp2, tmp0); -+ v.val[2] = _mm_movelh_ps(tmp1, tmp3); -+ v.val[3] = _mm_movehl_ps(tmp3, tmp1); -+ vst1q_f32(ptr, 
v.val[0]); -+ vst1q_f32((ptr + 4), v.val[1]); -+ vst1q_f32((ptr + 8), v.val[2]); -+ vst1q_f32((ptr + 12), v.val[3]); -+} -+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) -+ -+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] -+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); -+#define vst4q_p8 vst4q_u8 -+ -+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] -+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); -+#define vst4q_p16 vst4q_s16 -+ -+//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val) -+{ -+ __m128i sh0, sh1, val0, val2; -+ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, -+ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 -+ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, -+ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 -+ vst1q_u8(ptr, val0); -+ vst1q_u8((ptr + 16), val2); -+} -+#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val) -+ -+//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val) -+{ -+ __m128i sh0, sh1, val0, val2; -+ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1, -+ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3 -+ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 -+ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 -+ vst1q_u16(ptr, val0); //store as 128 bit structure -+ vst1q_u16((ptr + 8), val2); -+} -+#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val) -+ -+//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val) -+{ -+ //0,4, 1,5, 2,6, 3,7 -+ __m128i sh0, sh1, val0, val1; -+ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5 -+ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7 -+ val0 = _mm_unpacklo_epi64(sh0,sh1); // -+ val1 = _mm_unpackhi_epi64(sh0,sh1); // -+ vst1q_u32(ptr, val0); //store as 128 bit structure -+ vst1q_u32((ptr + 4), val1); -+} -+#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val) -+ -+//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val) -+{ -+ *(ptr) = val->val[0].m64_u64[0]; -+ *(ptr + 1) = val->val[1].m64_u64[0]; -+ *(ptr + 2) = val->val[2].m64_u64[0]; -+ *(ptr + 3) = val->val[3].m64_u64[0]; -+} -+#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val) -+ -+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] -+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) -+ -+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] -+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) -+ -+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, 
d3}, [r0] -+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) -+ -+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] -+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); -+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) -+ -+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); -+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example -+ -+//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] -+_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val) -+{ -+ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7 -+ *(ptr) = val->val[0].m64_f32[0]; -+ *(ptr + 1) = val->val[1].m64_f32[0]; -+ *(ptr + 2) = val->val[2].m64_f32[0]; -+ *(ptr + 3) = val->val[3].m64_f32[0]; -+ *(ptr + 4) = val->val[0].m64_f32[1]; -+ *(ptr + 5) = val->val[1].m64_f32[1]; -+ *(ptr + 6) = val->val[2].m64_f32[1]; -+ *(ptr + 7) = val->val[3].m64_f32[1]; -+} -+#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val) -+ -+//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] -+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); -+#define vst4_p8 vst4_u8 -+ -+//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] -+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); -+#define vst4_p16 vst4_u16 -+ -+//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* -+//******************************************************************************************************************** -+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) -+{ -+ vst1q_lane_s16(ptr, val->val[0], lane); -+ vst1q_lane_s16((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_u32(ptr, val->val[0], lane); -+ vst1q_lane_u32((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); -+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] -+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); -+#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int 
lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+} -+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane) -+ -+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] -+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); -+#define vst2q_lane_p16 vst2q_lane_s16 -+ -+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0] -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+} -+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+} -+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] -+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+} -+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); -+#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) -+ -+//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); -+#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] -+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); -+#define vst2_lane_s32(ptr, val, lane) 
vst2_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] -+//current IA SIMD doesn't support float16 -+ -+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] -+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+} -+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane) -+ -+//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] -+#define vst2_lane_p8 vst2_lane_u8 -+ -+//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] -+#define vst2_lane_p16 vst2_lane_u16 -+ -+//************************* Triple lanes stores ******************************************************* -+//******************************************************************************************************* -+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) -+{ -+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); -+ vst1q_lane_u16((ptr + 2), val->val[2], lane); -+} -+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) -+{ -+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); -+ vst1q_lane_u32((ptr + 2), val->val[2], lane); -+} -+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); -+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) -+ -+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); -+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) -+ -+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] -+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+ vst1q_lane_f32((ptr + 2), val->val[2], lane); -+} -+#define 
vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] -+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); -+#define vst3q_lane_p16 vst3q_lane_s16 -+ -+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+ *(ptr + 2) = val->val[2].m64_u8[lane]; -+} -+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+ *(ptr + 2) = val->val[2].m64_u16[lane]; -+} -+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+ *(ptr + 2) = val->val[2].m64_u32[lane]; -+} -+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); -+#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) -+ -+//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); -+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) -+ -+//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); -+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) -+ -+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); -+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+ *(ptr + 2) = 
val->val[2].m64_f32[lane]; -+} -+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); -+#define vst3_lane_p8 vst3_lane_u8 -+ -+//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] -+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); -+#define vst3_lane_p16 vst3_lane_s16 -+ -+//******************************** Quadruple lanes stores *********************************************** -+//******************************************************************************************************* -+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) -+{ -+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); -+ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); -+} -+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane) -+{ -+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); -+ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); -+} -+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); -+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) -+ -+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); -+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) -+ -+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); -+//current IA SIMD doesn't support float16 -+ -+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] -+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) -+{ -+ vst1q_lane_f32(ptr, val->val[0], lane); -+ vst1q_lane_f32((ptr + 1), val->val[1], lane); -+ vst1q_lane_f32((ptr + 2), val->val[2], lane); -+ vst1q_lane_f32((ptr + 3), val->val[3], lane); -+} -+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], 
d6[0]}, [r0] -+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); -+#define vst4q_lane_p16 vst4q_lane_u16 -+ -+//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane) -+{ -+ *(ptr) = val->val[0].m64_u8[lane]; -+ *(ptr + 1) = val->val[1].m64_u8[lane]; -+ *(ptr + 2) = val->val[2].m64_u8[lane]; -+ *(ptr + 3) = val->val[3].m64_u8[lane]; -+} -+#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane) -+ -+//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane) -+{ -+ *(ptr) = val->val[0].m64_u16[lane]; -+ *(ptr + 1) = val->val[1].m64_u16[lane]; -+ *(ptr + 2) = val->val[2].m64_u16[lane]; -+ *(ptr + 3) = val->val[3].m64_u16[lane]; -+} -+#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane) -+ -+//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_u32[lane]; -+ *(ptr + 1) = val->val[1].m64_u32[lane]; -+ *(ptr + 2) = val->val[2].m64_u32[lane]; -+ *(ptr + 3) = val->val[3].m64_u32[lane]; -+} -+#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane) -+ -+//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) -+ -+//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) -+ -+//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) -+ -+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); -+//current IA SIMD doesn't support float16 -+ -+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane) -+{ -+ *(ptr) = val->val[0].m64_f32[lane]; -+ *(ptr + 1) = val->val[1].m64_f32[lane]; -+ *(ptr + 2) = val->val[2].m64_f32[lane]; -+ *(ptr + 3) = val->val[3].m64_f32[lane]; -+} -+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane) -+ -+//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); -+#define vst4_lane_p8 vst4_lane_u8 -+ -+//void 
vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] -+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); -+#define vst4_lane_p16 vst4_lane_u16 -+ -+//************************************************************************************************** -+//************************ Extract lanes from a vector ******************************************** -+//************************************************************************************************** -+//These intrinsics extract a single lane (element) from a vector. -+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+#define vget_lane_u8(vec, lane) vec.m64_u8[lane] -+ -+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] -+#define vget_lane_u16(vec, lane) vec.m64_u16[lane] -+ -+ -+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_u32(vec, lane) vec.m64_u32[lane] -+ -+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] -+#define vget_lane_s8(vec, lane) vec.m64_i8[lane] -+ -+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] -+#define vget_lane_s16(vec, lane) vec.m64_i16[lane] -+ -+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_s32(vec, lane) vec.m64_i32[lane] -+ -+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] -+#define vget_lane_p8 vget_lane_u8 -+ -+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] -+#define vget_lane_p16 vget_lane_u16 -+ -+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] -+#define vget_lane_f32(vec, lane) vec.m64_f32[lane] -+ -+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+#define vgetq_lane_u8 _MM_EXTRACT_EPI8 -+ -+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] -+#define vgetq_lane_u16 _MM_EXTRACT_EPI16 -+ -+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+#define vgetq_lane_u32 _MM_EXTRACT_EPI32 -+ -+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] -+#define vgetq_lane_s8 vgetq_lane_u8 -+ -+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] -+#define vgetq_lane_s16 vgetq_lane_u16 -+ -+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+#define vgetq_lane_s32 vgetq_lane_u32 -+ -+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] -+#define vgetq_lane_p8 vgetq_lane_u8 -+ -+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] -+#define vgetq_lane_p16 vgetq_lane_u16 -+ -+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] -+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) -+{ -+ int32_t ilane; -+ ilane = _MM_EXTRACT_PS(vec,lane); -+ return *(float*)&ilane; -+} -+ -+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+#define vget_lane_s64(vec, lane) vec.m64_i64[0] -+ -+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 -+#define vget_lane_u64(vec, 
lane) vec.m64_u64[0] -+ -+ -+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+#define vgetq_lane_s64 (int64_t) vgetq_lane_u64 -+ -+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 -+#define vgetq_lane_u64 _MM_EXTRACT_EPI64 -+ -+// ***************** Set lanes within a vector ******************************************** -+// ************************************************************************************** -+//These intrinsics set a single lane (element) within a vector. -+//same functions as vld1_lane_xx ones, but take the value to be set directly. -+ -+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane) -+{ -+ uint8_t val; -+ val = value; -+ return vld1_lane_u8(&val, vec, lane); -+} -+ -+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16_t val; -+ val = value; -+ return vld1_lane_u16(&val, vec, lane); -+} -+ -+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32_t val; -+ val = value; -+ return vld1_lane_u32(&val, vec, lane); -+} -+ -+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane) -+{ -+ int8_t val; -+ val = value; -+ return vld1_lane_s8(&val, vec, lane); -+} -+ -+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane) -+{ -+ int16_t val; -+ val = value; -+ return vld1_lane_s16(&val, vec, lane); -+} -+ -+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane) -+{ -+ int32_t val; -+ val = value; -+ return vld1_lane_s32(&val, vec, lane); -+} -+ -+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 -+#define vset_lane_p8 vset_lane_u8 -+ -+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 -+#define vset_lane_p16 vset_lane_u16 -+ -+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32_t val; -+ val = value; -+ return vld1_lane_f32(&val, vec, lane); -+} -+ -+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane) -+{ -+ uint8_t val; -+ val = value; -+ return vld1q_lane_u8(&val, vec, lane); -+} -+ -+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane) -+{ -+ uint16_t val; -+ val = value; -+ return vld1q_lane_u16(&val, 
vec, lane); -+} -+ -+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane) -+{ -+ uint32_t val; -+ val = value; -+ return vld1q_lane_u32(&val, vec, lane); -+} -+ -+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane) -+{ -+ int8_t val; -+ val = value; -+ return vld1q_lane_s8(&val, vec, lane); -+} -+ -+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane) -+{ -+ int16_t val; -+ val = value; -+ return vld1q_lane_s16(&val, vec, lane); -+} -+ -+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane) -+{ -+ int32_t val; -+ val = value; -+ return vld1q_lane_s32(&val, vec, lane); -+} -+ -+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 -+#define vsetq_lane_p8 vsetq_lane_u8 -+ -+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 -+#define vsetq_lane_p16 vsetq_lane_u16 -+ -+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 -+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane) -+{ -+ float32_t val; -+ val = value; -+ return vld1q_lane_f32(&val, vec, lane); -+} -+ -+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane) -+{ -+ int64_t val; -+ val = value; -+ return vld1_lane_s64(&val, vec, lane); -+} -+ -+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane) -+{ -+ uint64_t val; -+ val = value; -+ return vld1_lane_u64(&val, vec, lane); -+} -+ -+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane) -+{ -+ uint64_t val; -+ val = value; -+ return vld1q_lane_s64(&val, vec, lane); -+} -+ -+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 -+#define vsetq_lane_u64 vsetq_lane_s64 -+ -+// ******************************************************************************* -+// **************** Initialize a vector from bit pattern *************************** -+// ******************************************************************************* -+//These intrinsics create a vector from a literal bit pattern. 
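The intended usage of the lane get/set intrinsics above and the vcreate family below is easiest to see from the caller's side. A minimal sketch, not part of the patch, assuming a toolchain built with this change so that <arm_neon.h> resolves to the x86 header it adds:

#include <stdint.h>
#include <stdio.h>
#include <arm_neon.h>   /* with this patch, x86 builds pick up the NEON-to-SSE header added here */

int main(void)
{
    uint64_t bits = 0x0706050403020100ULL;   /* bit pattern: lane 0 = 0x00 ... lane 7 = 0x07 */
    uint8x8_t v = vcreate_u8(bits);          /* reinterpret the 64-bit pattern as a vector */
    v = vset_lane_u8(0xff, v, 3);            /* replace lane 3 only, other lanes unchanged */
    printf("%u %u\n", (unsigned)vget_lane_u8(v, 3),   /* prints: 255 7 */
           (unsigned)vget_lane_u8(v, 7));
    return 0;
}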
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s8(a) (*(__m64_128*)&(a)) -+ -+ -+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s16 vcreate_s8 -+ -+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s32 vcreate_s8 -+ -+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 -+//no IA32 SIMD avalilable -+ -+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_f32(a) (*(__m64_128*)&(a)) -+ -+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u8 vcreate_s8 -+ -+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u16 vcreate_s16 -+ -+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u32 vcreate_s32 -+ -+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_u64 vcreate_s8 -+ -+ -+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_p8 vcreate_u8 -+ -+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_p16 vcreate_u16 -+ -+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 -+#define vcreate_s64 vcreate_u64 -+ -+//********************* Set all lanes to same value ******************************** -+//********************************************************************************* -+//These intrinsics set all lanes to the same value. -+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint8x8_t res; -+ int i; -+ for (i = 0; i<8; i++) { -+ res.m64_u8[i] = value; -+ } -+ return res; -+} -+ -+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint16x4_t res; -+ int i; -+ for (i = 0; i<4; i++) { -+ res.m64_u16[i] = value; -+ } -+ return res; -+} -+ -+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = value; -+ res.m64_u32[1] = value; -+ return res; -+} -+ -+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int8x8_t res; -+ int i; -+ for (i = 0; i<8; i++) { -+ res.m64_i8[i] = value; -+ } -+ return res; -+} -+ -+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int16x4_t res; -+ int i; -+ for (i = 0; i<4; i++) { -+ res.m64_i16[i] = value; -+ } -+ return res; -+} -+ -+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t res; -+ res.m64_i32[0] = value; -+ res.m64_i32[1] = value; -+ return res; -+} -+ -+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 -+#define vdup_n_p8 vdup_n_u8 -+ -+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 -+#define vdup_n_p16 vdup_n_s16 -+ -+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 -+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = value; -+ res.m64_f32[1] = value; -+ return res; -+} -+ -+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) -+ -+uint16x8_t 
vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) -+ -+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) -+ -+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_s8 _mm_set1_epi8 -+ -+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_s16 _mm_set1_epi16 -+ -+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_s32 _mm_set1_epi32 -+ -+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+#define vdupq_n_p8 vdupq_n_u8 -+ -+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+#define vdupq_n_p16 vdupq_n_u16 -+ -+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 -+#define vdupq_n_f32 _mm_set1_ps -+ -+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = value; -+ return res; -+} -+ -+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value) -+{ -+ uint64x1_t res; -+ res.m64_u64[0] = value; -+ return res; -+} -+ -+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) -+{ -+ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate -+ return LOAD_SI128(value2); -+} -+ -+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) -+{ -+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate -+ return LOAD_SI128(val); -+} -+ -+//**** Set all lanes to same value ************************ -+//Same functions as above - just aliaces.******************** -+//Probably they reflect the fact that 128-bit functions versions use VMOV instruction ********** -+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 -+#define vmov_n_u8 vdup_n_s8 -+ -+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 -+#define vmov_n_u16 vdup_n_s16 -+ -+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 -+#define vmov_n_u32 vdup_n_u32 -+ -+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 -+#define vmov_n_s8 vdup_n_s8 -+ -+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 -+#define vmov_n_s16 vdup_n_s16 -+ -+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 -+#define vmov_n_s32 vdup_n_s32 -+ -+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 -+#define vmov_n_p8 vdup_n_u8 -+ -+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 -+#define vmov_n_p16 vdup_n_s16 -+ -+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 -+#define vmov_n_f32 vdup_n_f32 -+ -+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_u8 vdupq_n_u8 -+ -+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_u16 vdupq_n_s16 -+ -+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_u32 vdupq_n_u32 -+ -+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_s8 vdupq_n_s8 -+ -+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_s16 vdupq_n_s16 -+ -+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_s32 vdupq_n_s32 -+ -+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 -+#define vmovq_n_p8 vdupq_n_u8 -+ -+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 -+#define vmovq_n_p16 vdupq_n_s16 -+ -+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 -+#define vmovq_n_f32 vdupq_n_f32 -+ -+int64x1_t 
vmov_n_s64(int64_t value); // VMOV d0,r0,r0 -+#define vmov_n_s64 vdup_n_s64 -+ -+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 -+#define vmov_n_u64 vdup_n_u64 -+ -+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 -+#define vmovq_n_s64 vdupq_n_s64 -+ -+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 -+#define vmovq_n_u64 vdupq_n_u64 -+ -+//**************Set all lanes to the value of one lane of a vector ************* -+//**************************************************************************** -+//here shuffle is better solution than lane extraction followed by set1 function -+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) -+{ -+ uint8x8_t res; -+ uint8_t valane; -+ int i = 0; -+ valane = vec.m64_u8[lane]; -+ for (i = 0; i<8; i++) { -+ res.m64_u8[i] = valane; -+ } -+ return res; -+} -+ -+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) -+{ -+ uint16x4_t res; -+ uint16_t valane; -+ valane = vec.m64_u16[lane]; -+ res.m64_u16[0] = valane; -+ res.m64_u16[1] = valane; -+ res.m64_u16[2] = valane; -+ res.m64_u16[3] = valane; -+ return res; -+} -+ -+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) -+{ -+ uint32x2_t res; -+ res.m64_u32[0] = vec.m64_u32[lane]; -+ res.m64_u32[1] = res.m64_u32[0]; -+ return res; -+} -+ -+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+#define vdup_lane_s8 vdup_lane_u8 -+ -+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+#define vdup_lane_s16 vdup_lane_u16 -+ -+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+#define vdup_lane_s32 vdup_lane_u32 -+ -+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] -+#define vdup_lane_p8 vdup_lane_u8 -+ -+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] -+#define vdup_lane_p16 vdup_lane_s16 -+ -+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] -+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = vec.m64_f32[lane]; -+ res.m64_f32[1] = res.m64_f32[0]; -+ return res; -+} -+ -+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0] -+{ -+ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane}; -+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8); -+} -+ -+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0] -+{ -+ //we could use 8bit shuffle for 16 bit as well -+ const int8_t lane16 = ((int8_t) lane) << 1; -+ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, -+ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1}; -+ return _mm_shuffle_epi8 
(_pM128i(vec), *(__m128i*)lanemask_e16); -+} -+ -+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6)) -+ -+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+#define vdupq_lane_s8 vdupq_lane_u8 -+ -+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+#define vdupq_lane_s16 vdupq_lane_u16 -+ -+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_s32 vdupq_lane_u32 -+ -+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] -+#define vdupq_lane_p8 vdupq_lane_u8 -+ -+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] -+#define vdupq_lane_p16 vdupq_lane_s16 -+ -+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] -+#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane)) -+ -+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+#define vdup_lane_s64(vec,lane) vec -+ -+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 -+#define vdup_lane_u64(vec,lane) vec -+ -+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane) -+{ -+ __m128i vec128; -+ vec128 = _pM128i(vec); -+ return _mm_unpacklo_epi64(vec128,vec128); -+} -+ -+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 -+#define vdupq_lane_u64 vdupq_lane_s64 -+ -+// ******************************************************************** -+// ******************** Combining vectors ***************************** -+// ******************************************************************** -+//These intrinsics join two 64 bit vectors into a single 128bit vector. 
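A short aside, not part of the patch: the combine intrinsics below and the split intrinsics that follow them compose directly. A sketch using only intrinsics defined in this section, exchanging the two 64-bit halves of a 128-bit float vector:

#include <arm_neon.h>

/* Return v with its low and high 64-bit halves exchanged. */
float32x4_t swap_halves_f32(float32x4_t v)
{
    float32x2_t lo = vget_low_f32(v);   /* lanes 0..1 */
    float32x2_t hi = vget_high_f32(v);  /* lanes 2..3 */
    return vcombine_f32(hi, lo);        /* new low half = old high half */
}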
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 -+#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 -+#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 -+#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 -+#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ) -+ -+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 -+//current IA SIMD doesn't support float16 -+ -+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high) -+{ -+ __m128i res; -+ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); -+ return _M128(res); -+} -+ -+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 -+#define vcombine_u8 vcombine_s8 -+ -+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 -+#define vcombine_u16 vcombine_s16 -+ -+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 -+#define vcombine_u32 vcombine_s32 -+ -+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 -+#define vcombine_u64 vcombine_s64 -+ -+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 -+#define vcombine_p8 vcombine_u8 -+ -+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 -+#define vcombine_p16 vcombine_u16 -+ -+//********************************************************************** -+//************************* Splitting vectors ************************** -+//********************************************************************** -+//**************** Get high part ****************************************** -+//These intrinsics split a 128 bit vector into 2 component 64 bit vectors -+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a) -+{ -+ int64x1_t res64; -+ __m128i res; -+ res = _mm_unpackhi_epi64(a,a); //SSE2 -+ return64(res); -+} -+ -+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a) -+{ -+ __m128i res; -+ __m64_128 res64; -+ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a)); -+ return64(res); -+} -+ -+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 -+#define vget_high_u8 vget_high_s8 -+ -+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 -+#define vget_high_u16 vget_high_s16 -+ -+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 -+#define vget_high_u32 
vget_high_s32 -+ -+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 -+#define vget_high_u64 vget_high_s64 -+ -+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 -+#define vget_high_p8 vget_high_u8 -+ -+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 -+#define vget_high_p16 vget_high_u16 -+ -+//********************** Get low part ********************** -+//********************************************************** -+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0 -+{ -+ int16x4_t res64; -+ return64(a); -+} -+ -+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0 -+{ -+ int16x4_t res64; -+ return64(a); -+} -+ -+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0 -+{ -+ int32x2_t res64; -+ return64(a); -+} -+ -+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0 -+{ -+ int64x1_t res64; -+ return64 (a); -+} -+ -+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 -+// IA32 SIMD doesn't work with 16bit floats currently -+ -+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 -+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a) -+{ -+ float32x2_t res64; -+ _M64f(res64, a); -+ return res64; -+} -+ -+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 -+#define vget_low_u8 vget_low_s8 -+ -+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 -+#define vget_low_u16 vget_low_s16 -+ -+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 -+#define vget_low_u32 vget_low_s32 -+ -+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 -+#define vget_low_u64 vget_low_s64 -+ -+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 -+#define vget_low_p8 vget_low_u8 -+ -+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 -+#define vget_low_p16 vget_low_s16 -+ -+//************************************************************************** -+//************************ Converting vectors ********************************** -+//************************************************************************** -+//************* Convert from float *************************************** -+// need to set _MM_SET_ROUNDING_MODE ( x) accordingly -+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 -+_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only -+ return64(res); -+} -+ -+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 -+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a) -+{ -+ //may be not effective compared with a serial SIMD solution -+ uint32x2_t res64; -+ __m128i res; -+ res = vcvtq_u32_f32(_pM128(a)); -+ return64(res); -+} -+ -+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 -+#define vcvtq_s32_f32 _mm_cvttps_epi32 -+ -+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 -+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 -+{ -+ //No single instruction SSE solution but we could implement it as following: -+ __m128i resi; -+ __m128 zero, mask, a_pos, mask_f_max_si, res; -+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; -+ zero = _mm_setzero_ps(); -+ mask = _mm_cmpgt_ps(a, zero); -+ a_pos = _mm_and_ps(a, mask); -+ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff); -+ res = 
_mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything -+ resi = _mm_cvttps_epi32(res); -+ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si); -+} -+ -+// ***** Convert to the fixed point with the number of fraction bits specified by b *********** -+//************************************************************************************************* -+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 -+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b) -+{ -+ int32x2_t res64; -+ return64(vcvtq_n_s32_f32(_pM128(a),b)); -+} -+ -+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 -+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b) -+{ -+ uint32x2_t res; -+ float convconst; -+ convconst = (float)((uint32_t)1 << b); -+ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst); -+ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst); -+ return res; -+} -+ -+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 -+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; -+ __m128 cconst128; -+ __m128i mask, res; -+ convconst = (float)(1 << b); -+ cconst128 = vdupq_n_f32(convconst); -+ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128)); -+ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask); -+ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000 -+} -+ -+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 -+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128; -+ convconst = (float)(1 << b); -+ cconst128 = vdupq_n_f32(convconst); -+ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); -+} -+ -+//***************** Convert to float ************************* -+//************************************************************* -+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 -+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits -+{ -+ float32x2_t res; -+ res.m64_f32[0] = (float) a.m64_i32[0]; -+ res.m64_f32[1] = (float) a.m64_i32[1]; -+ return res; -+} -+ -+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 -+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = (float) a.m64_u32[0]; -+ res.m64_f32[1] = (float) a.m64_u32[1]; -+ return res; -+} -+ -+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 -+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) -+ -+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 -+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 -+{ -+ //solution may be not optimal -+ __m128 two16, fHi, fLo; -+ __m128i hi, lo; -+ two16 = _mm_set1_ps((float)0x10000); //2^16 -+ // Avoid double rounding by doing two exact conversions -+ // of high and low 16-bit segments -+ hi = _mm_srli_epi32(a, 16); -+ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); -+ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); -+ fLo = _mm_cvtepi32_ps(lo); -+ // do single rounding according to current rounding mode -+ return _mm_add_ps(fHi, fLo); -+} -+ -+// ***** Convert to the float from fixed point with the number of fraction bits specified by b *********** -+float32x2_t vcvt_n_f32_s32(int32x2_t 
a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 -+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b) -+{ -+ float32x2_t res; -+ float convconst; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ res.m64_f32[0] = a.m64_i32[0] * convconst; -+ res.m64_f32[1] = a.m64_i32[1] * convconst; -+ return res; -+} -+ -+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 -+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32 -+{ -+ float32x2_t res; -+ float convconst; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ res.m64_f32[0] = a.m64_u32[0] * convconst; -+ res.m64_f32[1] = a.m64_u32[1] * convconst; -+ return res; -+} -+ -+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 -+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128, af; -+ convconst = (float)(1. / ((uint32_t)1 << b)); -+ af = _mm_cvtepi32_ps(a); -+ cconst128 = vdupq_n_f32(convconst); -+ return _mm_mul_ps(af,cconst128); -+} -+ -+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 -+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b) -+{ -+ float convconst; -+ __m128 cconst128, af; -+ convconst = (float)(1. / (1 << b)); -+ af = vcvtq_f32_u32(a); -+ cconst128 = vdupq_n_f32(convconst); -+ return _mm_mul_ps(af,cconst128); -+} -+ -+//**************Convert between floats *********************** -+//************************************************************ -+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 -+//Intel SIMD doesn't support 16bits floats curently -+ -+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 -+//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits -+ -+//************Vector narrow integer conversion (truncation) ****************** -+//**************************************************************************** -+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 -+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0 -+{ -+ int8x8_t res64; -+ __m128i res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only -+ return64(res); -+} -+ -+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 -+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0 -+{ -+ int16x4_t res64; -+ __m128i res; -+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; -+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only -+ return64(res); -+} -+ -+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 -+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a) -+{ -+ //may be not effective compared with a serial implementation -+ int32x2_t res64; -+ __m128i res; -+ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0) -+ return64(res); -+} -+ -+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 -+#define vmovn_u16 vmovn_s16 -+ -+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 -+#define vmovn_u32 vmovn_s32 -+ -+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 -+#define vmovn_u64 vmovn_s64 -+ -+//**************** Vector 
long move *********************** -+//*********************************************************** -+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 -+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1 -+ -+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 -+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1 -+ -+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 -+#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1 -+ -+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 -+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1 -+ -+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0 -+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1 -+ -+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 -+#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1 -+ -+//*************Vector saturating narrow integer***************** -+//************************************************************** -+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 -+_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = _mm_packs_epi16(a, a); -+ return64(res); -+} -+ -+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 -+_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = _mm_packs_epi32(a, a); -+ return64(res); -+} -+ -+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution -+{ -+ int32x2_t res; -+ _NEON2SSE_ALIGN_16 int64_t atmp[2]; -+ _mm_store_si128((__m128i*)atmp, a); -+ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX; -+ if(atmp[0]SINT_MAX) atmp[1] = SINT_MAX; -+ if(atmp[1]32 bits numbers -+ res_hi = _mm_or_si128(a, mask); -+ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(res_hi); -+} -+//************* Vector saturating narrow integer signed->unsigned ************** -+//***************************************************************************** -+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0 -+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a) -+{ -+ uint8x8_t res64; -+ __m128i res; -+ res = _mm_packus_epi16(a, a); //use low 64bits only -+ return64(res); -+} -+ -+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 -+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a) -+{ -+ uint16x4_t res64; -+ __m128i res; -+ res = _MM_PACKUS1_EPI32(a); //use low 64bits only -+ return64(res); -+} -+ -+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 -+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a) -+{ -+ uint32x2_t res64; -+ __m128i res_hi,res_lo, zero, cmp; -+ zero = _mm_setzero_si128(); -+ res_hi = _mm_srli_epi64(a, 32); -+ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero -+ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 -+ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive -+ res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff -+ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits -+ return64(res_lo); -+} -+ -+// ******************************************************** -+// **************** Table look up ************************** -+// ******************************************************** -+//VTBL (Vector Table Lookup) uses byte indexes in a 
control vector to look up byte values -+//in a table and generate a new vector. Indexes out of range return 0. -+//for Intel SIMD we need to set the MSB to 1 for zero return -+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ __m128i c7, maskgt, bmask, b128; -+ c7 = _mm_set1_epi8 (7); -+ b128 = _pM128i(b); -+ maskgt = _mm_cmpgt_epi8(b128,c7); -+ bmask = _mm_or_si128(b128,maskgt); -+ bmask = _mm_shuffle_epi8(_pM128i(a),bmask); -+ return64(bmask); -+} -+ -+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 -+#define vtbl1_s8 vtbl1_u8 -+ -+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 -+#define vtbl1_p8 vtbl1_u8 -+ -+//Special trick to avoid __declspec(align('8')) won't be aligned" error -+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b) -+{ -+ uint8x8_t res64; -+ __m128i c15, a01, maskgt15, bmask, b128; -+ c15 = _mm_set1_epi8 (15); -+ b128 = _pM128i(b); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ bmask = _mm_or_si128(b128, maskgt15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1])); -+ a01 = _mm_shuffle_epi8(a01, bmask); -+ return64(a01); -+} -+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b) -+ -+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+#define vtbl2_s8 vtbl2_u8 -+ -+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 -+#define vtbl2_p8 vtbl2_u8 -+ -+//Special trick to avoid __declspec(align('16')) won't be aligned" error -+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128; -+ c15 = _mm_set1_epi8 (15); -+ c23 = _mm_set1_epi8 (23); -+ b128 = _pM128i(b); -+ maskgt23 = _mm_cmpgt_epi8(b128,c23); -+ bmask = _mm_or_si128(b128, maskgt23); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); -+ sh0 = _mm_shuffle_epi8(a01, bmask); -+ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1 -+ return64(sh0); -+} -+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b) -+ -+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+#define vtbl3_s8 vtbl3_u8 -+ -+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 -+#define vtbl3_p8 vtbl3_u8 -+ -+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128; -+ c15 = _mm_set1_epi8 (15); -+ c31 = _mm_set1_epi8 (31); -+ b128 = _pM128i(b); -+ maskgt31 = _mm_cmpgt_epi8(b128,c31); -+ bmask = _mm_or_si128(b128, maskgt31); -+ maskgt15 = _mm_cmpgt_epi8(b128,c15); -+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1])); -+ a23 = 
_mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3])); -+ sh0 = _mm_shuffle_epi8(a01, bmask); -+ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1 -+ return64(sh0); -+} -+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b) -+ -+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+#define vtbl4_s8 vtbl4_u8 -+ -+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 -+#define vtbl4_p8 vtbl4_u8 -+ -+//****************** Extended table look up intrinsics *************************** -+//********************************************************************************** -+//VTBX (Vector Table Extension) works in the same way as VTBL do, -+// except that indexes out of range leave the destination element unchanged. -+ -+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) -+{ -+ uint8x8_t res64; -+ __m128i c7, maskgt, sh, c128; -+ c7 = _mm_set1_epi8 (7); -+ c128 = _pM128i(c); -+ maskgt = _mm_cmpgt_epi8(c128,c7); -+ c7 = _mm_and_si128(maskgt,_pM128i(a)); -+ sh = _mm_shuffle_epi8(_pM128i(b),c128); -+ sh = _mm_andnot_si128(maskgt,sh); -+ sh = _mm_or_si128(sh,c7); -+ return64(sh); -+} -+ -+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 -+#define vtbx1_s8 vtbx1_u8 -+ -+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 -+#define vtbx1_p8 vtbx1_u8 -+ -+//Special trick to avoid __declspec(align('8')) won't be aligned" error -+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c) -+{ -+ uint8x8_t res64; -+ __m128i c15, b01, maskgt15, sh, c128; -+ c15 = _mm_set1_epi8 (15); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128, c15); -+ c15 = _mm_and_si128(maskgt15, _pM128i(a)); -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1])); -+ sh = _mm_shuffle_epi8(b01, c128); -+ sh = _mm_andnot_si128(maskgt15, sh); -+ sh = _mm_or_si128(sh,c15); -+ return64(sh); -+} -+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+#define vtbx2_s8 vtbx2_u8 -+ -+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 -+#define vtbx2_p8 vtbx2_u8 -+ -+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128; -+ c15 = _mm_set1_epi8 (15); -+ c23 = _mm_set1_epi8 (23); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128,c15); -+ maskgt23 = _mm_cmpgt_epi8(c128,c23); -+ c23 = _mm_and_si128(maskgt23, _pM128i(a)); -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); -+ sh0 = _mm_shuffle_epi8(b01, c128); -+ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); -+ sh0 = 
_mm_andnot_si128(maskgt23,sh0); -+ sh0 = _mm_or_si128(sh0,c23); -+ return64(sh0); -+} -+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c); -+#define vtbx3_s8 vtbx3_u8 -+ -+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 -+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c); -+#define vtbx3_p8 vtbx3_u8 -+ -+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c) -+{ -+ //solution may be not optimal -+ uint8x8_t res64; -+ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128; -+ c15 = _mm_set1_epi8 (15); -+ c31 = _mm_set1_epi8 (31); -+ c128 = _pM128i(c); -+ maskgt15 = _mm_cmpgt_epi8(c128,c15); -+ maskgt31 = _mm_cmpgt_epi8(c128,c31); -+ c31 = _mm_and_si128(maskgt31, _pM128i(a)); -+ -+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1])); -+ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3])); -+ sh0 = _mm_shuffle_epi8(b01, c128); -+ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15) -+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); -+ sh0 = _mm_andnot_si128(maskgt31,sh0); -+ sh0 = _mm_or_si128(sh0,c31); -+ return64(sh0); -+} -+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c) -+ -+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c); -+#define vtbx4_s8 vtbx4_u8 -+ -+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 -+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c); -+#define vtbx4_p8 vtbx4_u8 -+ -+//************************************************************************************************* -+// *************************** Operations with a scalar value ********************************* -+//************************************************************************************************* -+ -+//******* Vector multiply accumulate by scalar ************************************************* -+//********************************************************************************************** -+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0] -+{ -+ int16_t c; -+ int16x4_t scalar; -+ c = vget_lane_s16(v, l); -+ scalar = vdup_n_s16(c); -+ return vmla_s16(a, b, scalar); -+} -+ -+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0] -+{ -+ int32_t c; -+ int32x2_t scalar; -+ c = vget_lane_s32(v, l); -+ scalar = vdup_n_s32(c); -+ return vmla_s32(a, b, scalar); -+} -+ -+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] -+#define vmla_lane_u16 vmla_lane_s16 -+ -+ -+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] -+#define vmla_lane_u32 vmla_lane_s32 -+ -+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, 
__constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) -+{ -+ float32_t vlane; -+ float32x2_t c; -+ vlane = vget_lane_f32(v, l); -+ c = vdup_n_f32(vlane); -+ return vmla_f32(a,b,c); -+} -+ -+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] -+{ -+ int16_t vlane; -+ int16x8_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdupq_n_s16(vlane); -+ return vmlaq_s16(a,b,c); -+} -+ -+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] -+{ -+ int32_t vlane; -+ int32x4_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdupq_n_s32(vlane); -+ return vmlaq_s32(a,b,c); -+} -+ -+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+#define vmlaq_lane_u16 vmlaq_lane_s16 -+ -+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+#define vmlaq_lane_u32 vmlaq_lane_s32 -+ -+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32_t vlane; -+ float32x4_t c; -+ vlane = vget_lane_f32(v, l); -+ c = vdupq_n_f32(vlane); -+ return vmlaq_f32(a,b,c); -+} -+ -+//***************** Vector widening multiply accumulate by scalar ********************** -+//*************************************************************************************** -+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlal_s16(a, b, c); -+} -+ -+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmlal_s32(a, b, c); -+} -+ -+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_u16(v, l); -+ c = vdup_n_u16(vlane); -+ return vmlal_u16(a, b, c); -+} -+ -+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmlal_u32(a, b, c); -+} -+ 
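Note: every *_lane_* intrinsic in this block follows the same three-step composition: read the selected lane with vget_lane, broadcast it with vdup_n, then call the plain vector operation. As a minimal scalar sketch of the intended semantics (plain C, independent of the NEON2SSE types; the names ref_vmlal_lane_s16/res/a/b/v/l are illustrative only and not part of the patch):

#include <stdint.h>

/* res[i] = a[i] + (int32_t)b[i] * v[l] -- the widening multiply-accumulate
 * by lane that vmlal_lane_s16 above emulates via vget_lane + vdup_n + vmlal. */
static void ref_vmlal_lane_s16(int32_t res[4], const int32_t a[4],
                               const int16_t b[4], const int16_t v[4], int l)
{
    int16_t lane = v[l];                      /* vget_lane_s16(v, l)    */
    for (int i = 0; i < 4; i++)               /* vdup_n_s16 + vmlal_s16 */
        res[i] = a[i] + (int32_t)b[i] * (int32_t)lane;
}

The same composition (lane extract, scalar broadcast, base vector op) applies to the vmla/vmls/vmull/vqdmull lane and _n_ variants that follow; only the element width and the base operation change.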
-+// ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* -+// ************************************************************************************************ -+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vqdmlal_s16(a, b, c); -+} -+ -+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) -+{ -+ int32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vqdmlal_s32(a, b, c); -+} -+ -+// ****** Vector multiply subtract by scalar ***************** -+// ************************************************************* -+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmls_s16(a, b, c); -+} -+ -+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmls_s32(a, b, c); -+} -+ -+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmls_s16(a, b, c); -+} -+ -+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmls_u32(a, b, c); -+} -+ -+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) -+{ -+ float32_t vlane; -+ float32x2_t c; -+ vlane = (float) vget_lane_f32(v, l); -+ c = vdup_n_f32(vlane); -+ return vmls_f32(a,b,c); -+} -+ -+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0] -+{ -+ int16_t vlane; -+ int16x8_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdupq_n_s16(vlane); -+ return vmlsq_s16(a, b,c); -+} -+ -+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t 
vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0] -+{ -+ int32_t vlane; -+ int32x4_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdupq_n_s32(vlane); -+ return vmlsq_s32(a,b,c); -+} -+ -+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x8_t c; -+ vlane = vget_lane_u16(v, l); -+ c = vdupq_n_u16(vlane); -+ return vmlsq_u16(a,b,c); -+} -+ -+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x4_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdupq_n_u32(vlane); -+ return vmlsq_u32(a,b,c); -+} -+ -+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32_t vlane; -+ float32x4_t c; -+ vlane = (float) vget_lane_f32(v, l); -+ c = vdupq_n_f32(vlane); -+ return vmlsq_f32(a,b,c); -+} -+ -+// **** Vector widening multiply subtract by scalar **** -+// **************************************************** -+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlsl_s16(a, b, c); -+} -+ -+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vmlsl_s32(a, b, c); -+} -+ -+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vmlsl_s16(a, b, c); -+} -+ -+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t c; -+ vlane = vget_lane_u32(v, l); -+ c = vdup_n_u32(vlane); -+ return vmlsl_u32(a, b, c); -+} -+ -+//********* Vector widening saturating doubling multiply subtract by scalar ************************** -+//****************************************************************************************************** -+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, 
__constrange(0,3) int l) -+{ -+ int16_t vlane; -+ int16x4_t c; -+ vlane = vget_lane_s16(v, l); -+ c = vdup_n_s16(vlane); -+ return vqdmlsl_s16(a, b, c); -+} -+ -+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32_t vlane; -+ int32x2_t c; -+ vlane = vget_lane_s32(v, l); -+ c = vdup_n_s32(vlane); -+ return vqdmlsl_s32(a, b, c); -+} -+//********** Vector multiply with scalar ***************************** -+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0] -+{ -+ int16x4_t b16x4; -+ b16x4 = vdup_n_s16(b); -+ return vmul_s16(a, b16x4); -+} -+ -+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] -+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0] -+{ -+ //serial solution looks faster -+ int32x2_t b32x2; -+ b32x2 = vdup_n_s32(b); -+ return vmul_s32(a, b32x2); -+} -+ -+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] -+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0] -+{ -+ float32x2_t b32x2; -+ b32x2 = vdup_n_f32(b); -+ return vmul_f32(a, b32x2); -+} -+ -+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] -+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0] -+{ -+ uint16x4_t b16x4; -+ b16x4 = vdup_n_s16(b); -+ return vmul_s16(a, b16x4); -+} -+ -+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] -+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0] -+{ -+ //serial solution looks faster -+ uint32x2_t b32x2; -+ b32x2 = vdup_n_u32(b); -+ return vmul_u32(a, b32x2); -+} -+ -+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] -+{ -+ int16x8_t b16x8; -+ b16x8 = vdupq_n_s16(b); -+ return vmulq_s16(a, b16x8); -+} -+ -+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] -+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] -+{ -+ int32x4_t b32x4; -+ b32x4 = vdupq_n_s32(b); -+ return vmulq_s32(a, b32x4); -+} -+ -+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] -+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] -+{ -+ float32x4_t b32x4; -+ b32x4 = vdupq_n_f32(b); -+ return vmulq_f32(a, b32x4); -+} -+ -+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] -+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] -+{ -+ uint16x8_t b16x8; -+ b16x8 = vdupq_n_s16(b); -+ return vmulq_s16(a, b16x8); -+} -+ -+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] -+{ -+ uint32x4_t b32x4; -+ b32x4 = vdupq_n_u32(b); -+ return vmulq_u32(a, b32x4); -+} -+ -+//********** Vector multiply lane ***************************** -+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); -+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c) -+{ -+ int16x4_t b16x4; -+ int16_t vlane; 
-+ vlane = vget_lane_s16(b, c); -+ b16x4 = vdup_n_s16(vlane); -+ return vmul_s16(a, b16x4); -+} -+ -+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c) -+{ -+ int32x2_t b32x2; -+ int32_t vlane; -+ vlane = vget_lane_s32(b, c); -+ b32x2 = vdup_n_s32(vlane); -+ return vmul_s32(a, b32x2); -+} -+ -+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c) -+{ -+ float32x2_t b32x2; -+ float32_t vlane; -+ vlane = vget_lane_f32(b, c); -+ b32x2 = vdup_n_f32(vlane); -+ return vmul_f32(a, b32x2); -+} -+ -+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); -+#define vmul_lane_u16 vmul_lane_s16 -+ -+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); -+#define vmul_lane_u32 vmul_lane_s32 -+ -+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c); -+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c) -+{ -+ int16x8_t b16x8; -+ int16_t vlane; -+ vlane = vget_lane_s16(b, c); -+ b16x8 = vdupq_n_s16(vlane); -+ return vmulq_s16(a, b16x8); -+} -+ -+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c) -+{ -+ int32x4_t b32x4; -+ int32_t vlane; -+ vlane = vget_lane_s32(b, c); -+ b32x4 = vdupq_n_s32(vlane); -+ return vmulq_s32(a, b32x4); -+} -+ -+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); -+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c) -+{ -+ float32x4_t b32x4; -+ float32_t vlane; -+ vlane = vget_lane_f32(b, c); -+ b32x4 = vdupq_n_f32(vlane); -+ return vmulq_f32(a, b32x4); -+} -+ -+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); -+#define vmulq_lane_u16 vmulq_lane_s16 -+ -+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); -+#define vmulq_lane_u32 vmulq_lane_s32 -+ -+//**** Vector long multiply with scalar ************ -+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0] -+{ -+ int16x4_t b16x4; -+ b16x4 = vdup_n_s16(val2); -+ return vmull_s16(vec1, b16x4); -+} -+ -+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0] -+{ -+ int32x2_t b32x2; -+ b32x2 = vdup_n_s32(val2); -+ return vmull_s32(vec1, b32x2); -+} -+ -+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0] -+{ -+ uint16x4_t b16x4; -+ b16x4 = vdup_n_s16(val2); -+ return vmull_s16(vec1, b16x4); -+} -+ -+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] -+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0] -+{ -+ uint32x2_t b32x2; -+ b32x2 = vdup_n_u32(val2); -+ return vmull_u32(vec1, b32x2); -+} -+ -+//**** Vector long multiply by scalar **** -+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t 
vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0] -+{ -+ int16_t vlane; -+ int16x4_t b; -+ vlane = vget_lane_s16(val2, val3); -+ b = vdup_n_s16(vlane); -+ return vmull_s16(vec1, b); -+} -+ -+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0] -+{ -+ int32_t vlane; -+ int32x2_t b; -+ vlane = vget_lane_s32(val2, val3); -+ b = vdup_n_s32(vlane); -+ return vmull_s32(vec1, b); -+} -+ -+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0] -+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0] -+{ -+ uint16_t vlane; -+ uint16x4_t b; -+ vlane = vget_lane_s16(val2, val3); -+ b = vdup_n_s16(vlane); -+ return vmull_s16(vec1, b); -+} -+ -+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] -+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0] -+{ -+ uint32_t vlane; -+ uint32x2_t b; -+ vlane = vget_lane_u32(val2, val3); -+ b = vdup_n_u32(vlane); -+ return vmull_u32(vec1, b); -+} -+ -+//********* Vector saturating doubling long multiply with scalar ******************* -+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2) -+{ -+ //the serial soulution may be faster due to saturation -+ int16x4_t b; -+ b = vdup_n_s16(val2); -+ return vqdmull_s16(vec1, b); -+} -+ -+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t b; -+ b = vdup_n_s32(val2); -+ return vqdmull_s32(vec1,b); //slow serial function!!!! -+} -+ -+//************* Vector saturating doubling long multiply by scalar *********************************************** -+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) -+{ -+ int16_t c; -+ int16x4_t scalar; -+ c = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(c); -+ return vqdmull_s16(vec1, scalar); -+} -+ -+ -+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32_t c; -+ int32x2_t scalar; -+ c = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(c); -+ return vqdmull_s32(vec1,scalar); //slow serial function!!!! 
-+} -+ -+// *****Vector saturating doubling multiply high with scalar ***** -+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2) -+{ -+ int16x4_t res64; -+ return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); -+} -+ -+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2) -+{ -+ int32x2_t res64; -+ return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); -+} -+ -+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(val2); -+ return vqdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(val2); -+ return vqdmulhq_s32(vec1, scalar); -+} -+ -+//***** Vector saturating doubling multiply high by scalar **************** -+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x4_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(vlane); -+ return vqdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32_t vlane; -+ int32x2_t scalar; -+ vlane = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(vlane); -+ return vqdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x8_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdupq_n_s16(vlane ); -+ return vqdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //solution may be not optimal -+ int32_t vlane; -+ int32x4_t scalar; -+ vlane = vgetq_lane_s32(_pM128i(val2), val3); -+ scalar = vdupq_n_s32(vlane ); -+ return vqdmulhq_s32(vec1, scalar); -+} -+ -+//******** Vector saturating rounding doubling multiply high with scalar *** -+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16x4_t scalar; -+ scalar = vdup_n_s16(val2); -+ return vqrdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, 
int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x2_t scalar; -+ scalar = vdup_n_s32(val2); -+ return vqrdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(val2); -+ return vqrdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(val2); -+ return vqrdmulhq_s32(vec1, scalar); -+} -+ -+//********* Vector rounding saturating doubling multiply high by scalar **** -+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] -+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x4_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdup_n_s16(vlane); -+ return vqrdmulh_s16(vec1, scalar); -+} -+ -+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ int32_t vlane; -+ int32x2_t scalar; -+ vlane = vget_lane_s32(val2, val3); -+ scalar = vdup_n_s32(vlane); -+ return vqrdmulh_s32(vec1, scalar); -+} -+ -+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] -+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0] -+{ -+ //solution may be not optimal -+ int16_t vlane; -+ int16x8_t scalar; -+ vlane = vget_lane_s16(val2, val3); -+ scalar = vdupq_n_s16(vlane); -+ return vqrdmulhq_s16(vec1, scalar); -+} -+ -+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) -+{ -+ //solution may be not optimal -+ int32_t vlane; -+ int32x4_t scalar; -+ vlane = vgetq_lane_s32(_pM128i(val2), val3); -+ scalar = vdupq_n_s32(vlane ); -+ return vqrdmulhq_s32(vec1, scalar); -+} -+ -+//**************Vector multiply accumulate with scalar ******************* -+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0] -+{ -+ int16x4_t scalar; -+ scalar = vdup_n_s16(c); -+ return vmla_s16(a, b, scalar); -+} -+ -+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0] -+{ -+ int32x2_t scalar; -+ scalar = vdup_n_s32(c); -+ return vmla_s32(a, b, scalar); -+} -+ -+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t 
b, uint16_t c); // VMLA.I16 d0, d0, d0[0] -+#define vmla_n_u16 vmla_n_s16 -+ -+ -+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] -+#define vmla_n_u32 vmla_n_s32 -+ -+ -+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0] -+{ -+ float32x2_t scalar; -+ scalar = vdup_n_f32(c); -+ return vmla_f32(a, b, scalar); -+} -+ -+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] -+{ -+ int16x8_t scalar; -+ scalar = vdupq_n_s16(c); -+ return vmlaq_s16(a,b,scalar); -+} -+ -+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] -+{ -+ int32x4_t scalar; -+ scalar = vdupq_n_s32(c); -+ return vmlaq_s32(a,b,scalar); -+} -+ -+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] -+#define vmlaq_n_u16 vmlaq_n_s16 -+ -+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] -+#define vmlaq_n_u32 vmlaq_n_s32 -+ -+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] -+{ -+ float32x4_t scalar; -+ scalar = vdupq_n_f32(c); -+ return vmlaq_f32(a,b,scalar); -+} -+ -+//************Vector widening multiply accumulate with scalar**************************** -+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlal_s16(a, b, vc); -+} -+ -+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmlal_s32(a, b, vc); -+} -+ -+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlal_s16(a, b, vc); -+} -+ -+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmlal_u32(a, b, vc); -+} -+ -+//************ Vector widening saturating doubling multiply accumulate with scalar ************** -+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) -+{ -+ //not optimal SIMD soulution, serial may be faster -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vqdmlal_s16(a, b, vc); -+} -+ -+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return 
vqdmlal_s32(a, b, vc); -+} -+ -+//******** Vector multiply subtract with scalar ************** -+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmls_s16(a, b, vc); -+} -+ -+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmls_s32(a, b, vc); -+} -+ -+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] -+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmls_s16(a, b, vc); -+} -+ -+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] -+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmls_u32(a, b, vc); -+} -+ -+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] -+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) -+{ -+ float32x2_t res; -+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c; -+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c; -+ return res; -+} -+ -+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] -+{ -+ int16x8_t vc; -+ vc = vdupq_n_s16(c); -+ return vmlsq_s16(a, b,vc); -+} -+ -+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0] -+{ -+ int32x4_t vc; -+ vc = vdupq_n_s32(c); -+ return vmlsq_s32(a,b,vc); -+} -+ -+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] -+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0] -+{ -+ uint32x4_t vc; -+ vc = vdupq_n_u32(c); -+ return vmlsq_u32(a,b,vc); -+} -+ -+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0] -+{ -+ uint32x4_t vc; -+ vc = vdupq_n_u32(c); -+ return vmlsq_u32(a,b,vc); -+} -+ -+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] -+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) -+{ -+ float32x4_t vc; -+ vc = vdupq_n_f32(c); -+ return vmlsq_f32(a,b,vc); -+} -+ -+//**** Vector widening multiply subtract with scalar ****** -+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0] -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vmlsl_s16(a, b, vc); -+} -+ -+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0] -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vmlsl_s32(a, b, vc); -+} -+ -+uint32x4_t vmlsl_n_u16(uint32x4_t 
a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0] -+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0] -+{ -+ uint16x4_t vc; -+ vc = vdup_n_u16(c); -+ return vmlsl_u16(a, b, vc); -+} -+ -+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] -+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0] -+{ -+ uint32x2_t vc; -+ vc = vdup_n_u32(c); -+ return vmlsl_u32(a, b, vc); -+} -+ -+//***** Vector widening saturating doubling multiply subtract with scalar ********* -+//********************************************************************************** -+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] -+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) -+{ -+ int16x4_t vc; -+ vc = vdup_n_s16(c); -+ return vqdmlsl_s16(a, b, vc); -+} -+ -+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int32x2_t vc; -+ vc = vdup_n_s32(c); -+ return vqdmlsl_s32(a, b, vc); -+} -+ -+//******************* Vector extract *********************************************** -+//************************************************************************************* -+//VEXT (Vector Extract) extracts elements from the bottom end of the second operand -+//vector and the top end of the first, concatenates them, and places the result in the destination vector -+//c elements from the bottom end of the second operand and (8-c) from the top end of the first -+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 -+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL) -+{ -+ int8x8_t res; -+ int i; -+ for (i = 0; i<8 - c; i++) { -+ res.m64_i8[i] = a.m64_i8[i + c]; -+ } -+ for(i = 0; i> 1); -+ tmp = _mm_srli_epi32(res, 2); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); -+ tmp = _mm_srli_epi32(res, 4); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); -+ tmp = _mm_srli_epi32(res, 8); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); -+ tmp = _mm_srli_epi32(res, 16); -+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); -+ -+ tmp = _mm_srli_epi32(res, 1); -+ tmp = _mm_and_si128(tmp, c55555555); -+ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); -+ -+ tmp = _mm_srli_epi32(res, 2); -+ tmp = _mm_and_si128(tmp, c33333333); -+ tmp1 = _mm_and_si128(res, c33333333); -+ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); -+ -+ tmp = _mm_srli_epi32(res, 4); -+ tmp = _mm_add_epi32(tmp, res); -+ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f); -+ -+ tmp = _mm_srli_epi32(res, 8); -+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); -+ -+ tmp = _mm_srli_epi32(res, 16); -+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); -+ -+ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; -+ -+ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; -+} -+ -+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 -+#define vclzq_u8 vclzq_s8 -+ -+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 -+#define vclzq_u16 vclzq_s16 -+ 
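Note: the vclzq_s32 body above computes count-leading-zeros without a dedicated instruction: it smears the topmost set bit down through all lower bit positions, popcounts the smeared value with the classic SWAR 0x55555555/0x33333333/0x0f0f0f0f steps, and returns 32 minus that count. A scalar sketch of the same algorithm for one 32-bit word (illustrative only; ref_clz32 is not part of the patch):

#include <stdint.h>

static uint32_t ref_clz32(uint32_t x)
{
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;   /* smear the highest set bit down */
    x |= x >> 8;  x |= x >> 16;
    x -= (x >> 1) & 0x55555555u;               /* SWAR popcount of the smear     */
    x  = ((x >> 2) & 0x33333333u) + (x & 0x33333333u);
    x  = ((x >> 4) + x) & 0x0f0f0f0fu;
    x += x >> 8;
    x += x >> 16;
    return 32u - (x & 0x3fu);                  /* leading zeros = 32 - popcount  */
}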
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 -+#define vclzq_u32 vclzq_s32 -+ -+//************** Count leading sign bits ************************** -+//******************************************************************** -+//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following -+// the topmost bit, that are the same as the topmost bit, in each element in a vector -+//No corresponding vector intrinsics in IA32, need to implement it. -+//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits -+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 -+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vclsq_s8(_pM128i(a)); -+ return64(res); -+} -+ -+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 -+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = vclsq_s16(_pM128i(a)); -+ return64(res); -+} -+ -+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 -+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = vclsq_s32(_pM128i(a)); -+ return64(res); -+} -+ -+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 -+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) -+{ -+ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; -+ cff = _mm_cmpeq_epi8 (a,a); //0xff -+ c80 = _mm_set1_epi8(0x80); -+ c1 = _mm_set1_epi8(1); -+ a_mask = _mm_and_si128(a, c80); -+ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s8(a_comb); -+ return _mm_sub_epi8(a_comb, c1); -+} -+ -+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 -+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) -+{ -+ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; -+ cffff = _mm_cmpeq_epi16(a,a); -+ c8000 = _mm_slli_epi16(cffff, 15); //0x8000 -+ c1 = _mm_srli_epi16(cffff,15); //0x1 -+ a_mask = _mm_and_si128(a, c8000); -+ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cffff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s16(a_comb); -+ return _mm_sub_epi16(a_comb, c1); -+} -+ -+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 -+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) -+{ -+ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; -+ cffffffff = _mm_cmpeq_epi32(a,a); -+ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 -+ c1 = _mm_srli_epi32(cffffffff,31); //0x1 -+ a_mask = _mm_and_si128(a, c80000000); -+ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive -+ a_neg = _mm_xor_si128(a, cffffffff); -+ a_neg = _mm_and_si128(a_mask, a_neg); -+ a_pos = _mm_andnot_si128(a_mask, a); -+ a_comb = _mm_or_si128(a_pos, a_neg); -+ a_comb = vclzq_s32(a_comb); -+ return _mm_sub_epi32(a_comb, c1); -+} -+ -+//************************* Count number of set bits ******************************** -+//************************************************************************************* -+//No corresponding SIMD solution. 
One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element -+//another option is to do the following algorithm: -+ -+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 -+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a) -+{ -+ uint8x8_t res64; -+ __m128i res; -+ res = vcntq_u8(_pM128i(a)); -+ return64(res); -+} -+ -+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 -+#define vcnt_s8 vcnt_u8 -+ -+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 -+#define vcnt_p8 vcnt_u8 -+ -+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 -+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) -+{ -+ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, -+ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, -+ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, -+ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 }; -+ __m128i maskLOW, mask, lowpopcnt, hipopcnt; -+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set -+ mask = _mm_and_si128(a, maskLOW); -+ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway -+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits -+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set -+ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway -+ return _mm_add_epi8(lowpopcnt, hipopcnt); -+} -+ -+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 -+#define vcntq_s8 vcntq_u8 -+ -+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 -+#define vcntq_p8 vcntq_u8 -+ -+//************************************************************************************** -+//*********************** Logical operations **************************************** -+//************************************************************************************** -+//************************** Bitwise not *********************************** -+//several Bitwise not implementations possible for SIMD. 
Eg "xor" with all ones, but the following one gives good performance -+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vmvnq_s8(_pM128i(a)); -+ return64(res); -+} -+ -+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a) -+{ -+ int16x4_t res64; -+ __m128i res; -+ res = vmvnq_s16(_pM128i(a)); -+ return64(res); -+} -+ -+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 -+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a) -+{ -+ int32x2_t res64; -+ __m128i res; -+ res = vmvnq_s32(_pM128i(a)); -+ return64(res); -+} -+ -+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 -+#define vmvn_u8 vmvn_s8 -+ -+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 -+#define vmvn_u16 vmvn_s16 -+ -+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 -+#define vmvn_u32 vmvn_s32 -+ -+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 -+#define vmvn_p8 vmvn_u8 -+ -+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi8 (a,a); //0xff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi16 (a,a); //0xffff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 -+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 -+{ -+ __m128i c1; -+ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff -+ return _mm_andnot_si128 (a, c1); -+} -+ -+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 -+#define vmvnq_u8 vmvnq_s8 -+ -+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 -+#define vmvnq_u16 vmvnq_s16 -+ -+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 -+#define vmvnq_u32 vmvnq_s32 -+ -+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 -+#define vmvnq_p8 vmvnq_u8 -+ -+//****************** Bitwise and *********************** -+//****************************************************** -+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_and_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0]; -+ return res; -+} -+ -+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 -+#define vand_u8 vand_s8 -+ -+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 -+#define vand_u16 vand_s16 -+ -+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 -+#define vand_u32 vand_s32 -+ -+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 -+#define vand_u64 vand_s64 -+ -+ -+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 -+#define vandq_s8 _mm_and_si128 -+ -+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 -+#define vandq_s16 _mm_and_si128 -+ -+int32x4_t vandq_s32(int32x4_t 
a, int32x4_t b); // VAND q0,q0,q0 -+#define vandq_s32 _mm_and_si128 -+ -+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 -+#define vandq_s64 _mm_and_si128 -+ -+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 -+#define vandq_u8 _mm_and_si128 -+ -+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 -+#define vandq_u16 _mm_and_si128 -+ -+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 -+#define vandq_u32 _mm_and_si128 -+ -+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 -+#define vandq_u64 _mm_and_si128 -+ -+//******************** Bitwise or ********************************* -+//****************************************************************** -+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(_mm_or_si128(_pM128i(a),_pM128i(b))); -+} -+ -+ -+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0]; -+ return res; -+} -+ -+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 -+#define vorr_u8 vorr_s8 -+ -+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 -+#define vorr_u16 vorr_s16 -+ -+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 -+#define vorr_u32 vorr_s32 -+ -+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 -+#define vorr_u64 vorr_s64 -+ -+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 -+#define vorrq_s8 _mm_or_si128 -+ -+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 -+#define vorrq_s16 _mm_or_si128 -+ -+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 -+#define vorrq_s32 _mm_or_si128 -+ -+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 -+#define vorrq_s64 _mm_or_si128 -+ -+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 -+#define vorrq_u8 _mm_or_si128 -+ -+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 -+#define vorrq_u16 _mm_or_si128 -+ -+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 -+#define vorrq_u32 _mm_or_si128 -+ -+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 -+#define vorrq_u64 _mm_or_si128 -+ -+//************* Bitwise exclusive or (EOR or XOR) ****************** -+//******************************************************************* -+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); -+} -+ -+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 -+#define veor_s16 veor_s8 -+ -+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 -+#define veor_s32 veor_s8 -+ -+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0]; -+ return res; 
-+} -+ -+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 -+#define veor_u8 veor_s8 -+ -+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 -+#define veor_u16 veor_s16 -+ -+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 -+#define veor_u32 veor_s32 -+ -+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 -+#define veor_u64 veor_s64 -+ -+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 -+#define veorq_s8 _mm_xor_si128 -+ -+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 -+#define veorq_s16 _mm_xor_si128 -+ -+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 -+#define veorq_s32 _mm_xor_si128 -+ -+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 -+#define veorq_s64 _mm_xor_si128 -+ -+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 -+#define veorq_u8 _mm_xor_si128 -+ -+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 -+#define veorq_u16 _mm_xor_si128 -+ -+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 -+#define veorq_u32 _mm_xor_si128 -+ -+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 -+#define veorq_u64 _mm_xor_si128 -+ -+//********************** Bit Clear ********************************** -+//******************************************************************* -+//Logical AND complement (AND negation or AND NOT) -+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" -+} -+ -+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 -+#define vbic_s16 vbic_s8 -+ -+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 -+#define vbic_s32 vbic_s8 -+ -+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 -+#define vbic_u8 vbic_s8 -+ -+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 -+#define vbic_u16 vbic_s16 -+ -+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 -+#define vbic_u32 vbic_s32 -+ -+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 -+#define vbic_u64 vbic_s64 -+ -+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 -+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 -+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 -+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 -+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 -+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 -+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 -+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 -+#define 
vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" -+ -+//**************** Bitwise OR complement ******************************** -+//**************************************** ******************************** -+//no exact IA 32 match, need to implement it as following -+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b) -+{ -+ int8x8_t res64; -+ return64(vornq_s8(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b) -+{ -+ int16x4_t res64; -+ return64(vornq_s16(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2_t res64; -+ return64(vornq_s32(_pM128i(a), _pM128i(b))); -+} -+ -+ -+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 -+#define vorn_u8 vorn_s8 -+ -+ -+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 -+#define vorn_u16 vorn_s16 -+ -+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 -+#define vorn_u32 vorn_s32 -+ -+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 -+#define vorn_u64 vorn_s64 -+ -+ -+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s8( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s16( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s32( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b) -+{ -+ __m128i c1, b1; -+ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff -+ b1 = _mm_andnot_si128 (b, c1); -+ return _mm_or_si128 (a, b1); -+} -+ -+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_u8( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_s16( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+ -+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 -+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 -+{ -+ __m128i b1; -+ b1 = vmvnq_u32( b); //bitwise not for b -+ return _mm_or_si128 (a, b1); -+} -+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 -+#define vornq_u64 vornq_s64 -+ -+//********************* Bitwise Select ***************************** 
-+//****************************************************************** -+//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) -+ -+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the -+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. -+ -+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination -+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged -+ -+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination -+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. -+ -+//VBSL only is implemented for SIMD -+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) -+{ -+ int8x8_t res64; -+ __m128i res; -+ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); -+ return64(res); -+} -+ -+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_s16 vbsl_s8 -+ -+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 -+#define vbsl_s32 vbsl_s8 -+ -+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) -+{ -+ int64x1_t res; -+ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]); -+ return res; -+} -+ -+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 -+#define vbsl_u8 vbsl_s8 -+ -+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_u16 vbsl_s8 -+ -+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 -+#define vbsl_u32 vbsl_s8 -+ -+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 -+#define vbsl_u64 vbsl_s64 -+ -+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 -+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) -+{ -+ __m128 sel1, sel2; -+ __m64_128 res64; -+ sel1 = _mm_and_ps (_pM128(a), _pM128(b)); -+ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c)); -+ sel1 = _mm_or_ps (sel1, sel2); -+ _M64f(res64, sel1); -+ return res64; -+} -+ -+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 -+#define vbsl_p8 vbsl_s8 -+ -+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 -+#define vbsl_p16 vbsl_s8 -+ -+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 -+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 -+{ -+ __m128i sel1, sel2; -+ sel1 = _mm_and_si128 (a, b); -+ sel2 = _mm_andnot_si128 (a, c); -+ return _mm_or_si128 (sel1, sel2); -+} -+ -+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_s16 vbslq_s8 -+ -+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 -+#define vbslq_s32 vbslq_s8 -+ -+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 -+#define vbslq_s64 vbslq_s8 -+ -+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 -+#define vbslq_u8 vbslq_s8 -+ -+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_u16 vbslq_s8 -+ -+uint32x4_t 
vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 -+#define vbslq_u32 vbslq_s8 -+ -+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 -+#define vbslq_u64 vbslq_s8 -+ -+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 -+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 -+{ -+ __m128 sel1, sel2; -+ sel1 = _mm_and_ps (*(__m128*)&a, b); -+ sel2 = _mm_andnot_ps (*(__m128*)&a, c); -+ return _mm_or_ps (sel1, sel2); -+} -+ -+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 -+#define vbslq_p8 vbslq_u8 -+ -+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 -+#define vbslq_p16 vbslq_s8 -+ -+//************************************************************************************ -+//**************** Transposition operations **************************************** -+//************************************************************************************ -+//***************** Vector Transpose ************************************************ -+//************************************************************************************ -+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. -+// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) -+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp -+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7) -+ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6), -+ return val; -+} -+ -+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15}; -+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3 -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2), -+ return val; -+} -+ -+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 -+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) -+{ -+ int32x2x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 -+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0, -+ return val; -+} -+ -+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 -+#define vtrn_u8 vtrn_s8 -+ -+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 -+#define vtrn_u16 vtrn_s16 -+ -+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 -+#define vtrn_u32 vtrn_s32 -+ -+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 -+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) -+{ -+ float32x2x2_t val; -+ 
val.val[0].m64_f32[0] = a.m64_f32[0]; -+ val.val[0].m64_f32[1] = b.m64_f32[0]; -+ val.val[1].m64_f32[0] = a.m64_f32[1]; -+ val.val[1].m64_f32[1] = b.m64_f32[1]; -+ return val; //a0,b0,a1,b1 -+} -+ -+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 -+#define vtrn_p8 vtrn_u8 -+ -+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 -+#define vtrn_p16 vtrn_s16 -+ -+//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 -+{ -+ int8x16x2_t r8x16; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 -+ -+ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) -+ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) -+ return r8x16; -+} -+ -+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 -+{ -+ int16x8x2_t v16x8; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 -+ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 -+ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 -+ return v16x8; -+} -+ -+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ int32x4x2_t v32x4; -+ __m128i a_sh, b_sh; -+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 -+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 -+ -+ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 -+ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 -+ return v32x4; -+} -+ -+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 -+#define vtrnq_u8 vtrnq_s8 -+ -+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 -+#define vtrnq_u16 vtrnq_s16 -+ -+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 -+#define vtrnq_u32 vtrnq_s32 -+ -+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ float32x4x2_t f32x4; -+ __m128 a_sh, b_sh; -+ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness -+ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness -+ -+ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 -+ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 -+ return f32x4; -+} -+ -+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 -+#define vtrnq_p8 vtrnq_s8 -+ -+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 -+#define vtrnq_p16 vtrnq_s16 -+ 
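The vtrnq_s32 path above reduces the NEON 2x2 transpose to an SSE2 shuffle (imm 216, i.e. _MM_SHUFFLE(3,1,2,0)) on each operand followed by unpacklo/unpackhi. A minimal standalone sketch of that pattern, assuming an SSE2 toolchain; the main() harness, variable names and expected values are illustrative only and not part of the patch:

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);      /* a0,a1,a2,a3 */
    __m128i b = _mm_setr_epi32(10, 11, 12, 13);  /* b0,b1,b2,b3 */
    __m128i a_sh = _mm_shuffle_epi32(a, 216);    /* a0,a2,a1,a3 */
    __m128i b_sh = _mm_shuffle_epi32(b, 216);    /* b0,b2,b1,b3 */
    int lo[4], hi[4];
    _mm_storeu_si128((__m128i *)lo, _mm_unpacklo_epi32(a_sh, b_sh)); /* a0,b0,a2,b2 */
    _mm_storeu_si128((__m128i *)hi, _mm_unpackhi_epi32(a_sh, b_sh)); /* a1,b1,a3,b3 */
    printf("%d %d %d %d / %d %d %d %d\n",
           lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3]);
    return 0;  /* expected: 0 10 2 12 / 1 11 3 13 */
}

The 64-bit d-register variants above follow the same route after widening through _pM128i() and storing back the low half.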
-+//***************** Interleave elements *************************** -+//***************************************************************** -+//output has (a0,b0,a1,b1, a2,b2,.....) -+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); -+ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 -+#define vzip_s32 vtrn_s32 -+ -+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 -+#define vzip_u8 vzip_s8 -+ -+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 -+#define vzip_u16 vzip_s16 -+ -+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 -+#define vzip_u32 vzip_s32 -+ -+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 -+#define vzip_f32 vtrn_f32 -+ -+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 -+#define vzip_p8 vzip_u8 -+ -+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 -+#define vzip_p16 vzip_u16 -+ -+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 -+{ -+ int8x16x2_t r8x16; -+ r8x16.val[0] = _mm_unpacklo_epi8(a, b); -+ r8x16.val[1] = _mm_unpackhi_epi8(a, b); -+ return r8x16; -+} -+ -+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 -+{ -+ int16x8x2_t r16x8; -+ r16x8.val[0] = _mm_unpacklo_epi16(a, b); -+ r16x8.val[1] = _mm_unpackhi_epi16(a, b); -+ return r16x8; -+} -+ -+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 -+{ -+ int32x4x2_t r32x4; -+ r32x4.val[0] = _mm_unpacklo_epi32(a, b); -+ r32x4.val[1] = _mm_unpackhi_epi32(a, b); -+ return r32x4; -+} -+ -+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 -+#define vzipq_u8 vzipq_s8 -+ -+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 -+#define vzipq_u16 vzipq_s16 -+ -+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 -+#define vzipq_u32 vzipq_s32 -+ -+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 -+{ -+ float32x4x2_t f32x4; -+ f32x4.val[0] = _mm_unpacklo_ps ( a, b); -+ f32x4.val[1] = _mm_unpackhi_ps ( a, b); -+ return f32x4; -+} -+ -+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 -+#define vzipq_p8 vzipq_u8 -+ -+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 -+#define vzipq_p16 vzipq_u16 -+ -+//*********************** De-Interleave elements ************************* -+//************************************************************************* -+//As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...) 
-+//no such functions in IA32 SIMD, shuffle is required -+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 -+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0 -+{ -+ int8x8x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15}; -+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7) -+ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 -+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0 -+{ -+ int16x4x2_t val; -+ __m128i tmp, val0; -+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; -+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 -+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3 -+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 -+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0 -+{ -+ int32x2x2_t val; -+ __m128i val0; -+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 -+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); -+ return val; -+} -+ -+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 -+#define vuzp_u8 vuzp_s8 -+ -+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 -+#define vuzp_u16 vuzp_s16 -+ -+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 -+#define vuzp_u32 vuzp_s32 -+ -+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 -+#define vuzp_f32 vzip_f32 -+ -+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 -+#define vuzp_p8 vuzp_u8 -+ -+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 -+#define vuzp_p16 vuzp_u16 -+ -+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 -+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 -+{ -+ int8x16x2_t v8x16; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 -+ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b -+ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, -+ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 -+ return v8x16; -+} -+ -+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 -+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 -+{ -+ int16x8x2_t v16x8; -+ __m128i a_sh, b_sh; -+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; -+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 -+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, 
b4, b6, b1, b3, b5, b7 -+ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 -+ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 -+ return v16x8; -+} -+ -+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 -+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 -+{ -+ //may be not optimal solution compared with serial -+ int32x4x2_t v32x4; -+ __m128i a_sh, b_sh; -+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 -+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 -+ -+ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 -+ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 -+ return v32x4; -+} -+ -+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 -+#define vuzpq_u8 vuzpq_s8 -+ -+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 -+#define vuzpq_u16 vuzpq_s16 -+ -+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 -+#define vuzpq_u32 vuzpq_s32 -+ -+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 -+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 -+{ -+ float32x4x2_t v32x4; -+ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however -+ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however -+ return v32x4; -+} -+ -+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 -+#define vuzpq_p8 vuzpq_u8 -+ -+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 -+#define vuzpq_p16 vuzpq_u16 -+ -+//############################################################################################## -+//*********************** Reinterpret cast intrinsics.****************************************** -+//############################################################################################## -+// Not a part of oficial NEON instruction set but available in gcc compiler ********************* -+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t); -+#define vreinterpret_p8_u32 -+ -+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t); -+#define vreinterpret_p8_u16 -+ -+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t); -+#define vreinterpret_p8_u8 -+ -+poly8x8_t vreinterpret_p8_s32 (int32x2_t t); -+#define vreinterpret_p8_s32 -+ -+poly8x8_t vreinterpret_p8_s16 (int16x4_t t); -+#define vreinterpret_p8_s16 -+ -+poly8x8_t vreinterpret_p8_s8 (int8x8_t t); -+#define vreinterpret_p8_s8 -+ -+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t); -+#define vreinterpret_p8_u64 -+ -+poly8x8_t vreinterpret_p8_s64 (int64x1_t t); -+#define vreinterpret_p8_s64 -+ -+poly8x8_t vreinterpret_p8_f32 (float32x2_t t); -+#define vreinterpret_p8_f32 -+ -+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t); -+#define vreinterpret_p8_p16 -+ -+poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); -+#define vreinterpretq_p8_u32 -+ -+poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); -+#define vreinterpretq_p8_u16 -+ -+poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); -+#define vreinterpretq_p8_u8 -+ -+poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); -+#define vreinterpretq_p8_s32 -+ -+poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); -+#define vreinterpretq_p8_s16 -+ -+poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); -+#define vreinterpretq_p8_s8 -+ -+poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); -+#define vreinterpretq_p8_u64 -+ -+poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); -+#define vreinterpretq_p8_s64 
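Most of the vreinterpret* macros in this block expand to nothing because every integer and polynomial vector type is backed by the same __m128i; only the float <-> integer direction needs a real cast, and even that is a zero-cost bit-pattern reinterpretation, as the _M128i()/_mm_castps_si128 expansions used for the *_f32 variants show. A minimal sketch of that behaviour, assuming an SSE2 toolchain; the harness is illustrative only:

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 f = _mm_set1_ps(1.0f);           /* each lane holds the pattern 0x3f800000 */
    __m128i bits = _mm_castps_si128(f);     /* reinterpret: no value conversion */
    unsigned int lanes[4];
    _mm_storeu_si128((__m128i *)lanes, bits);
    printf("0x%08x\n", lanes[0]);           /* prints 0x3f800000, not 0x00000001 */
    return 0;
}

On real NEON hardware vreinterpret is likewise a pure type-level cast, so the no-op expansion keeps the two sides behaviourally equivalent.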
-+ -+poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); -+#define vreinterpretq_p8_f32(t) _M128i(t) -+ -+poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); -+#define vreinterpretq_p8_p16 -+ -+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t); -+#define vreinterpret_p16_u32 -+ -+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t); -+#define vreinterpret_p16_u16 -+ -+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t); -+#define vreinterpret_p16_u8 -+ -+poly16x4_t vreinterpret_p16_s32 (int32x2_t t); -+#define vreinterpret_p16_s32 -+ -+poly16x4_t vreinterpret_p16_s16 (int16x4_t t); -+#define vreinterpret_p16_s16 -+ -+poly16x4_t vreinterpret_p16_s8 (int8x8_t t); -+#define vreinterpret_p16_s8 -+ -+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t); -+#define vreinterpret_p16_u64 -+ -+poly16x4_t vreinterpret_p16_s64 (int64x1_t t); -+#define vreinterpret_p16_s64 -+ -+poly16x4_t vreinterpret_p16_f32 (float32x2_t t); -+#define vreinterpret_p16_f32 -+ -+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t); -+#define vreinterpret_p16_p8 -+ -+poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); -+#define vreinterpretq_p16_u32 -+ -+poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); -+#define vreinterpretq_p16_u16 -+ -+poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); -+#define vreinterpretq_p16_s32 -+ -+poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); -+#define vreinterpretq_p16_s16 -+ -+poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); -+#define vreinterpretq_p16_s8 -+ -+poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); -+#define vreinterpretq_p16_u64 -+ -+poly16x8_t vreinterpretq_p16_s64 (int64x2_t t); -+#define vreinterpretq_p16_s64 -+ -+poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); -+#define vreinterpretq_p16_f32(t) _M128i(t) -+ -+poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); -+#define vreinterpretq_p16_p8 vreinterpretq_s16_p8 -+ -+//**** Integer to float ****** -+float32x2_t vreinterpret_f32_u32 (uint32x2_t t); -+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t)) -+ -+ -+float32x2_t vreinterpret_f32_u16 (uint16x4_t t); -+#define vreinterpret_f32_u16 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_u8 (uint8x8_t t); -+#define vreinterpret_f32_u8 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s32 (int32x2_t t); -+#define vreinterpret_f32_s32 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s16 (int16x4_t t); -+#define vreinterpret_f32_s16 vreinterpret_f32_u32 -+ -+float32x2_t vreinterpret_f32_s8 (int8x8_t t); -+#define vreinterpret_f32_s8 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_u64(uint64x1_t t); -+#define vreinterpret_f32_u64 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_s64 (int64x1_t t); -+#define vreinterpret_f32_s64 vreinterpret_f32_u32 -+ -+ -+float32x2_t vreinterpret_f32_p16 (poly16x4_t t); -+#define vreinterpret_f32_p16 vreinterpret_f32_u32 -+ -+float32x2_t vreinterpret_f32_p8 (poly8x8_t t); -+#define vreinterpret_f32_p8 vreinterpret_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); -+#define vreinterpretq_f32_u32(t) *(__m128*)&(t) -+ -+float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); -+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); -+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s32 (int32x4_t t); -+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s16 (int16x8_t t); -+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s8 (int8x16_t t); -+#define vreinterpretq_f32_s8 
vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); -+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_s64 (int64x2_t t); -+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); -+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32 -+ -+float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); -+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32 -+ -+//*** Integer type conversions ****************** -+//no conversion necessary for the following functions because it is same data type -+int64x1_t vreinterpret_s64_u32 (uint32x2_t t); -+#define vreinterpret_s64_u32 -+ -+int64x1_t vreinterpret_s64_u16 (uint16x4_t t); -+#define vreinterpret_s64_u16 -+ -+int64x1_t vreinterpret_s64_u8 (uint8x8_t t); -+#define vreinterpret_s64_u8 -+ -+int64x1_t vreinterpret_s64_s32 (int32x2_t t); -+#define vreinterpret_s64_s32 -+ -+int64x1_t vreinterpret_s64_s16 (int16x4_t t); -+#define vreinterpret_s64_s16 -+ -+int64x1_t vreinterpret_s64_s8 (int8x8_t t); -+#define vreinterpret_s64_s8 -+ -+int64x1_t vreinterpret_s64_u64 (uint64x1_t t); -+#define vreinterpret_s64_u64 -+ -+int64x1_t vreinterpret_s64_f32 (float32x2_t t); -+#define vreinterpret_s64_f32 -+ -+int64x1_t vreinterpret_s64_p16 (poly16x4_t t); -+#define vreinterpret_s64_p16 -+ -+int64x1_t vreinterpret_s64_p8 (poly8x8_t t); -+#define vreinterpret_s64_p8 -+ -+int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); -+#define vreinterpretq_s64_u32 -+ -+int64x2_t vreinterpretq_s64_s16 (uint16x8_t t); -+#define vreinterpretq_s64_s16 -+ -+int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); -+#define vreinterpretq_s64_u8 -+ -+int64x2_t vreinterpretq_s64_s32 (int32x4_t t); -+#define vreinterpretq_s64_s32 -+ -+int64x2_t vreinterpretq_s64_u16 (int16x8_t t); -+#define vreinterpretq_s64_u16 -+ -+int64x2_t vreinterpretq_s64_s8 (int8x16_t t); -+#define vreinterpretq_s64_s8 -+ -+int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); -+#define vreinterpretq_s64_u64 -+ -+int64x2_t vreinterpretq_s64_f32 (float32x4_t t); -+#define vreinterpretq_s64_f32(t) _M128i(t) -+ -+int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); -+#define vreinterpretq_s64_p16 -+ -+int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); -+#define vreinterpretq_s64_p8 -+ -+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t); -+#define vreinterpret_u64_u32 -+ -+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t); -+#define vreinterpret_u64_u16 -+ -+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t); -+#define vreinterpret_u64_u8 -+ -+uint64x1_t vreinterpret_u64_s32 (int32x2_t t); -+#define vreinterpret_u64_s32 -+ -+uint64x1_t vreinterpret_u64_s16 (int16x4_t t); -+#define vreinterpret_u64_s16 -+ -+uint64x1_t vreinterpret_u64_s8 (int8x8_t t); -+#define vreinterpret_u64_s8 -+ -+uint64x1_t vreinterpret_u64_s64 (int64x1_t t); -+#define vreinterpret_u64_s64 -+ -+uint64x1_t vreinterpret_u64_f32 (float32x2_t t); -+#define vreinterpret_u64_f32 -+ -+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t); -+#define vreinterpret_u64_p16 -+ -+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t); -+#define vreinterpret_u64_p8 -+ -+uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); -+#define vreinterpretq_u64_u32 -+ -+uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); -+#define vreinterpretq_u64_u16 -+ -+uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); -+#define vreinterpretq_u64_u8 -+ -+uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); -+#define vreinterpretq_u64_s32 -+ -+uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); -+#define vreinterpretq_u64_s16 -+ -+uint64x2_t 
vreinterpretq_u64_s8 (int8x16_t t); -+#define vreinterpretq_u64_s8 -+ -+uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); -+#define vreinterpretq_u64_s64 -+ -+uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); -+#define vreinterpretq_u64_f32(t) _M128i(t) -+ -+uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); -+#define vreinterpretq_u64_p16 -+ -+uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); -+#define vreinterpretq_u64_p8 -+ -+int8x8_t vreinterpret_s8_u32 (uint32x2_t t); -+#define vreinterpret_s8_u32 -+ -+int8x8_t vreinterpret_s8_u16 (uint16x4_t t); -+#define vreinterpret_s8_u16 -+ -+int8x8_t vreinterpret_s8_u8 (uint8x8_t t); -+#define vreinterpret_s8_u8 -+ -+int8x8_t vreinterpret_s8_s32 (int32x2_t t); -+#define vreinterpret_s8_s32 -+ -+int8x8_t vreinterpret_s8_s16 (int16x4_t t); -+#define vreinterpret_s8_s16 -+ -+int8x8_t vreinterpret_s8_u64 (uint64x1_t t); -+#define vreinterpret_s8_u64 -+ -+int8x8_t vreinterpret_s8_s64 (int64x1_t t); -+#define vreinterpret_s8_s64 -+ -+int8x8_t vreinterpret_s8_f32 (float32x2_t t); -+#define vreinterpret_s8_f32 -+ -+int8x8_t vreinterpret_s8_p16 (poly16x4_t t); -+#define vreinterpret_s8_p16 -+ -+int8x8_t vreinterpret_s8_p8 (poly8x8_t t); -+#define vreinterpret_s8_p8 -+ -+int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); -+#define vreinterpretq_s8_u32 -+ -+int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); -+#define vreinterpretq_s8_u16 -+ -+int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); -+#define vreinterpretq_s8_u8 -+ -+int8x16_t vreinterpretq_s8_s32 (int32x4_t t); -+#define vreinterpretq_s8_s32 -+ -+int8x16_t vreinterpretq_s8_s16 (int16x8_t t); -+#define vreinterpretq_s8_s16 -+ -+int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); -+#define vreinterpretq_s8_u64 -+ -+int8x16_t vreinterpretq_s8_s64 (int64x2_t t); -+#define vreinterpretq_s8_s64 -+ -+int8x16_t vreinterpretq_s8_f32 (float32x4_t t); -+#define vreinterpretq_s8_f32(t) _M128i(t) -+ -+int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); -+#define vreinterpretq_s8_p16 -+ -+int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); -+#define vreinterpretq_s8_p8 -+ -+int16x4_t vreinterpret_s16_u32 (uint32x2_t t); -+#define vreinterpret_s16_u32 -+ -+int16x4_t vreinterpret_s16_u16 (uint16x4_t t); -+#define vreinterpret_s16_u16 -+ -+int16x4_t vreinterpret_s16_u8 (uint8x8_t t); -+#define vreinterpret_s16_u8 -+ -+int16x4_t vreinterpret_s16_s32 (int32x2_t t); -+#define vreinterpret_s16_s32 -+ -+int16x4_t vreinterpret_s16_s8 (int8x8_t t); -+#define vreinterpret_s16_s8 -+ -+int16x4_t vreinterpret_s16_u64 (uint64x1_t t); -+#define vreinterpret_s16_u64 -+ -+int16x4_t vreinterpret_s16_s64 (int64x1_t t); -+#define vreinterpret_s16_s64 -+ -+int16x4_t vreinterpret_s16_f32 (float32x2_t t); -+#define vreinterpret_s16_f32 -+ -+ -+int16x4_t vreinterpret_s16_p16 (poly16x4_t t); -+#define vreinterpret_s16_p16 -+ -+int16x4_t vreinterpret_s16_p8 (poly8x8_t t); -+#define vreinterpret_s16_p8 -+ -+int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); -+#define vreinterpretq_s16_u32 -+ -+int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); -+#define vreinterpretq_s16_u16 -+ -+int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); -+#define vreinterpretq_s16_u8 -+ -+int16x8_t vreinterpretq_s16_s32 (int32x4_t t); -+#define vreinterpretq_s16_s32 -+ -+int16x8_t vreinterpretq_s16_s8 (int8x16_t t); -+#define vreinterpretq_s16_s8 -+ -+int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); -+#define vreinterpretq_s16_u64 -+ -+int16x8_t vreinterpretq_s16_s64 (int64x2_t t); -+#define vreinterpretq_s16_s64 -+ -+int16x8_t vreinterpretq_s16_f32 (float32x4_t t); -+#define vreinterpretq_s16_f32(t) 
_M128i(t) -+ -+int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); -+#define vreinterpretq_s16_p16 -+ -+int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); -+#define vreinterpretq_s16_p8 -+ -+int32x2_t vreinterpret_s32_u32 (uint32x2_t t); -+#define vreinterpret_s32_u32 -+ -+int32x2_t vreinterpret_s32_u16 (uint16x4_t t); -+#define vreinterpret_s32_u16 -+ -+int32x2_t vreinterpret_s32_u8 (uint8x8_t t); -+#define vreinterpret_s32_u8 -+ -+int32x2_t vreinterpret_s32_s16 (int16x4_t t); -+#define vreinterpret_s32_s16 -+ -+int32x2_t vreinterpret_s32_s8 (int8x8_t t); -+#define vreinterpret_s32_s8 -+ -+int32x2_t vreinterpret_s32_u64 (uint64x1_t t); -+#define vreinterpret_s32_u64 -+ -+int32x2_t vreinterpret_s32_s64 (int64x1_t t); -+#define vreinterpret_s32_s64 -+ -+int32x2_t vreinterpret_s32_f32 (float32x2_t t); -+#define vreinterpret_s32_f32 -+ -+int32x2_t vreinterpret_s32_p16 (poly16x4_t t); -+#define vreinterpret_s32_p16 -+ -+int32x2_t vreinterpret_s32_p8 (poly8x8_t t); -+#define vreinterpret_s32_p8 -+ -+int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); -+#define vreinterpretq_s32_u32 -+ -+int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); -+#define vreinterpretq_s32_u16 -+ -+int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); -+#define vreinterpretq_s32_u8 -+ -+int32x4_t vreinterpretq_s32_s16 (int16x8_t t); -+#define vreinterpretq_s32_s16 -+ -+int32x4_t vreinterpretq_s32_s8 (int8x16_t t); -+#define vreinterpretq_s32_s8 -+ -+int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); -+#define vreinterpretq_s32_u64 -+ -+int32x4_t vreinterpretq_s32_s64 (int64x2_t t); -+#define vreinterpretq_s32_s64 -+ -+int32x4_t vreinterpretq_s32_f32 (float32x4_t t); -+#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t)) -+ -+int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); -+#define vreinterpretq_s32_p16 -+ -+int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); -+#define vreinterpretq_s32_p8 -+ -+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t); -+#define vreinterpret_u8_u32 -+ -+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t); -+#define vreinterpret_u8_u16 -+ -+uint8x8_t vreinterpret_u8_s32 (int32x2_t t); -+#define vreinterpret_u8_s32 -+ -+uint8x8_t vreinterpret_u8_s16 (int16x4_t t); -+#define vreinterpret_u8_s16 -+ -+uint8x8_t vreinterpret_u8_s8 (int8x8_t t); -+#define vreinterpret_u8_s8 -+ -+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t); -+#define vreinterpret_u8_u64 -+ -+uint8x8_t vreinterpret_u8_s64 (int64x1_t t); -+#define vreinterpret_u8_s64 -+ -+uint8x8_t vreinterpret_u8_f32 (float32x2_t t); -+#define vreinterpret_u8_f32 -+ -+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t); -+#define vreinterpret_u8_p16 -+ -+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t); -+#define vreinterpret_u8_p8 -+ -+uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); -+#define vreinterpretq_u8_u32 -+ -+uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); -+#define vreinterpretq_u8_u16 -+ -+uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); -+#define vreinterpretq_u8_s32 -+ -+uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); -+#define vreinterpretq_u8_s16 -+ -+uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); -+#define vreinterpretq_u8_s8 -+ -+uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); -+#define vreinterpretq_u8_u64 -+ -+uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); -+#define vreinterpretq_u8_s64 -+ -+uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); -+#define vreinterpretq_u8_f32(t) _M128i(t) -+ -+ -+uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); -+#define vreinterpretq_u8_p16 -+ -+uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); -+#define vreinterpretq_u8_p8 -+ -+uint16x4_t 
vreinterpret_u16_u32 (uint32x2_t t); -+#define vreinterpret_u16_u32 -+ -+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t); -+#define vreinterpret_u16_u8 -+ -+uint16x4_t vreinterpret_u16_s32 (int32x2_t t); -+#define vreinterpret_u16_s32 -+ -+uint16x4_t vreinterpret_u16_s16 (int16x4_t t); -+#define vreinterpret_u16_s16 -+ -+uint16x4_t vreinterpret_u16_s8 (int8x8_t t); -+#define vreinterpret_u16_s8 -+ -+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t); -+#define vreinterpret_u16_u64 -+ -+uint16x4_t vreinterpret_u16_s64 (int64x1_t t); -+#define vreinterpret_u16_s64 -+ -+uint16x4_t vreinterpret_u16_f32 (float32x2_t t); -+#define vreinterpret_u16_f32 -+ -+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t); -+#define vreinterpret_u16_p16 -+ -+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t); -+#define vreinterpret_u16_p8 -+ -+uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); -+#define vreinterpretq_u16_u32 -+ -+uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); -+#define vreinterpretq_u16_u8 -+ -+uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); -+#define vreinterpretq_u16_s32 -+ -+uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); -+#define vreinterpretq_u16_s16 -+ -+uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); -+#define vreinterpretq_u16_s8 -+ -+uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); -+#define vreinterpretq_u16_u64 -+ -+uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); -+#define vreinterpretq_u16_s64 -+ -+uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); -+#define vreinterpretq_u16_f32(t) _M128i(t) -+ -+uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); -+#define vreinterpretq_u16_p16 -+ -+uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); -+#define vreinterpretq_u16_p8 -+ -+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t); -+#define vreinterpret_u32_u16 -+ -+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t); -+#define vreinterpret_u32_u8 -+ -+uint32x2_t vreinterpret_u32_s32 (int32x2_t t); -+#define vreinterpret_u32_s32 -+ -+uint32x2_t vreinterpret_u32_s16 (int16x4_t t); -+#define vreinterpret_u32_s16 -+ -+uint32x2_t vreinterpret_u32_s8 (int8x8_t t); -+#define vreinterpret_u32_s8 -+ -+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t); -+#define vreinterpret_u32_u64 -+ -+uint32x2_t vreinterpret_u32_s64 (int64x1_t t); -+#define vreinterpret_u32_s64 -+ -+uint32x2_t vreinterpret_u32_f32 (float32x2_t t); -+#define vreinterpret_u32_f32 -+ -+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t); -+#define vreinterpret_u32_p16 -+ -+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t); -+#define vreinterpret_u32_p8 -+ -+uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); -+#define vreinterpretq_u32_u16 -+ -+uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); -+#define vreinterpretq_u32_u8 -+ -+uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); -+#define vreinterpretq_u32_s32 -+ -+uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); -+#define vreinterpretq_u32_s16 -+ -+uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); -+#define vreinterpretq_u32_s8 -+ -+uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); -+#define vreinterpretq_u32_u64 -+ -+uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); -+#define vreinterpretq_u32_s64 -+ -+uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); -+#define vreinterpretq_u32_f32(t) _M128i(t) -+ -+uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); -+#define vreinterpretq_u32_p16 -+ -+uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); -+#define vreinterpretq_u32_p8 -+ -+#endif /* NEON2SSE_H */ -diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h -index fee33a3..22fb2ce 100644 ---- a/gcc/config/i386/gnu-user.h -+++ 
b/gcc/config/i386/gnu-user.h -@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see - When the -shared link option is used a final link is not being - done. */ - -+#undef ANDROID_TARGET_CC1_SPEC -+#define ANDROID_TARGET_CC1_SPEC \ -+ " -mssse3 -fno-short-enums " \ -+ - #undef ASM_SPEC - #define ASM_SPEC \ -- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" -+ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) - - #undef SUBTARGET_EXTRA_SPECS - #define SUBTARGET_EXTRA_SPECS \ -diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h -index 7a02a7e..cac4179 100644 ---- a/gcc/config/i386/gnu-user64.h -+++ b/gcc/config/i386/gnu-user64.h -@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - #define SPEC_X32 "mx32" - #endif - -+#undef ANDROID_TARGET_CC1_SPEC -+#define ANDROID_TARGET_CC1_SPEC \ -+ "%{m32:-mssse3 -fno-short-enums}" \ -+ "%{!m32:-msse4.2 -mpopcnt}" -+ - #undef ASM_SPEC - #define ASM_SPEC "%{" SPEC_32 ":--32} \ - %{" SPEC_64 ":--64} \ -diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c -index 3d044e8..5c89fca 100644 ---- a/gcc/config/i386/i386.c -+++ b/gcc/config/i386/i386.c -@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) - else if (!SYMBOL_REF_FAR_ADDR_P (op0) - && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC -+ && !TARGET_HAS_BIONIC - && flag_pie - && !SYMBOL_REF_WEAK (op0) - && !SYMBOL_REF_FUNCTION_P (op0))) -diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h -index 4b9910f..c33e074 100644 ---- a/gcc/config/i386/linux-common.h -+++ b/gcc/config/i386/linux-common.h -@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. 
If not see - #undef CC1_SPEC - #define CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) -+ GNU_USER_TARGET_CC1_SPEC \ -+ ANDROID_TARGET_CC1_SPEC \ -+ " " \ -+ ANDROID_CC1_SPEC("-fPIC")) -+ -+#define CC1PLUS_SPEC \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) - - #undef LINK_SPEC - #define LINK_SPEC \ -diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h -index a1f98d3..3725799 100644 ---- a/gcc/config/i386/pmm_malloc.h -+++ b/gcc/config/i386/pmm_malloc.h -@@ -31,7 +31,7 @@ - #ifndef __cplusplus - extern int posix_memalign (void **, size_t, size_t); - #else --extern "C" int posix_memalign (void **, size_t, size_t) throw (); -+extern "C" int posix_memalign (void **, size_t, size_t); - #endif - - static __inline void * -diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h -index 301a41c..bebfe7f 100644 ---- a/gcc/config/linux-android.h -+++ b/gcc/config/linux-android.h -@@ -38,15 +38,18 @@ - "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" - - #define ANDROID_LINK_SPEC \ -- "%{shared: -Bsymbolic}" -+ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" - --#define ANDROID_CC1_SPEC \ -+#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ - "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ -- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" -+ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" - - #define ANDROID_CC1PLUS_SPEC \ -- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ -- "%{!frtti:%{!fno-rtti: -fno-rtti}}" -+ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ -+ "%{!frtti:%{!fno-rtti: -frtti}}" -+ -+#define ANDROID_ASM_SPEC \ -+ "--noexecstack" - - #define ANDROID_LIB_SPEC \ - "%{!static: -ldl}" -diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h -new file mode 100644 -index 0000000..32c539c ---- /dev/null -+++ b/gcc/config/mips/android.h -@@ -0,0 +1,49 @@ -+/* Target macros for mips*-*android* targets. -+ Copyright (C) 2014 Free Software Foundation, Inc. -+ -+This file is part of GCC. -+ -+GCC is free software; you can redistribute it and/or modify -+it under the terms of the GNU General Public License as published by -+the Free Software Foundation; either version 3, or (at your option) -+any later version. -+ -+GCC is distributed in the hope that it will be useful, -+but WITHOUT ANY WARRANTY; without even the implied warranty of -+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+GNU General Public License for more details. -+ -+You should have received a copy of the GNU General Public License -+along with GCC; see the file COPYING3. If not see -+. */ -+ -+#undef DRIVER_SELF_SPECS -+#define DRIVER_SELF_SPECS \ -+ /* Make sure a -mips option is present. This helps us to pick \ -+ the right multilib, and also makes the later specs easier \ -+ to write. */ \ -+ MIPS_ISA_LEVEL_SPEC, \ -+ \ -+ /* Infer the default float setting from -march. */ \ -+ MIPS_ARCH_FLOAT_SPEC, \ -+ \ -+ /* Infer the -msynci setting from -march if not explicitly set. */ \ -+ MIPS_ISA_SYNCI_SPEC, \ -+ \ -+ /* If no ABI option is specified, infer one from the ISA level \ -+ or -mgp setting. */ \ -+ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ -+ \ -+ /* If no FP ABI option is specified, infer one from the \ -+ ABI/ISA level unless there is a conflicting option. 
*/ \ -+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ -+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ -+ \ -+ /* If no odd-spreg option is specified, infer one from the ISA. */ \ -+ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ -+ \ -+ /* Base SPECs. */ \ -+ BASE_DRIVER_SELF_SPECS, \ -+ \ -+ /* Use the standard linux specs for everything else. */ \ -+ LINUX_DRIVER_SELF_SPECS -diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h -index 15b549c..4a28160 100644 ---- a/gcc/config/mips/gnu-user.h -+++ b/gcc/config/mips/gnu-user.h -@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see - /* The GNU C++ standard library requires this. */ \ - if (c_dialect_cxx ()) \ - builtin_define ("_GNU_SOURCE"); \ -+ ANDROID_TARGET_OS_CPP_BUILTINS(); \ - } while (0) - - #undef SUBTARGET_CPP_SPEC -@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see - - #undef SUBTARGET_ASM_SPEC - #define SUBTARGET_ASM_SPEC \ -- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" -+ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ -+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) - - /* The MIPS assembler has different syntax for .set. We set it to - .dummy to trap any errors. */ -@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); - #endif - - #define LINUX_DRIVER_SELF_SPECS \ -- NO_SHARED_SPECS \ -+ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ - MARCH_MTUNE_NATIVE_SPECS, \ - /* -mplt has no effect without -mno-shared. Simplify later \ - specs handling by removing a redundant option. */ \ -diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h -index 8429a7c..4b0825d 100644 ---- a/gcc/config/mips/linux-common.h -+++ b/gcc/config/mips/linux-common.h -@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see - #undef SUBTARGET_CC1_SPEC - #define SUBTARGET_CC1_SPEC \ - LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ -- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) -+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) - - #undef CC1PLUS_SPEC - #define CC1PLUS_SPEC \ -@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see - - /* The default value isn't sufficient in 64-bit mode. */ - #define STACK_CHECK_PROTECT (TARGET_64BIT ? 
16 * 1024 : 12 * 1024) -+ -+#ifdef IN_LIBGCC2 -+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) -+#endif -diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android -new file mode 100644 -index 0000000..39f512c ---- /dev/null -+++ b/gcc/config/mips/t-linux-android -@@ -0,0 +1,3 @@ -+MULTILIB_OPTIONS = mips32r2/mips32r6 -+MULTILIB_DIRNAMES = mips-r2 mips-r6 -+MULTILIB_OSDIRNAMES = ../libr2 ../libr6 -diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 -new file mode 100644 -index 0000000..55cab7d ---- /dev/null -+++ b/gcc/config/mips/t-linux-android64 -@@ -0,0 +1,4 @@ -+MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 -+MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 -+MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 -+MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 -diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h -index 37ecfc4..a5f1b99 100644 ---- a/gcc/config/openbsd.h -+++ b/gcc/config/openbsd.h -@@ -136,8 +136,12 @@ while (0) - #define LIB_SPEC OBSD_LIB_SPEC - - #if defined(HAVE_LD_EH_FRAME_HDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " - #endif -+#endif - - #undef LIB_SPEC - #define LIB_SPEC OBSD_LIB_SPEC -diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h -index cbf9097..eb2217f 100644 ---- a/gcc/config/rs6000/sysv4.h -+++ b/gcc/config/rs6000/sysv4.h -@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) - -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" - - #if defined(HAVE_LD_EH_FRAME_HDR) --# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+# ifdef USE_EH_FRAME_HDR_FOR_STATIC -+# define LINK_EH_SPEC "--eh-frame-hdr " -+# else -+# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+# endif - #endif - - #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ -diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h -index 5160e1f..7632a50 100644 ---- a/gcc/config/sol2.h -+++ b/gcc/config/sol2.h -@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see - /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs - --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ - #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) -+#ifdef USE_EH_FRAME_HDR_FOR_STATIC -+#define LINK_EH_SPEC "--eh-frame-hdr " -+#else - #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " -+#endif - #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ - #endif - -diff --git a/gcc/configure b/gcc/configure -index 1c6e340..28ad050 100755 ---- a/gcc/configure -+++ b/gcc/configure -@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 - enable_fix_cortex_a53_843419 - with_glibc_version - enable_gnu_unique_object -+enable_eh_frame_hdr_for_static - enable_linker_build_id - enable_default_ssp - with_long_double_128 -@@ -1670,6 +1671,9 @@ Optional Features: - --enable-gnu-unique-object - enable the use of the @gnu_unique_object ELF - extension on glibc systems -+ --enable-eh-frame-hdr-for-static -+ enable linker PT_GNU_EH_FRAME support for static -+ executable - --enable-linker-build-id - compiler will always pass --build-id to linker - --enable-default-ssp enable Stack Smashing Protection as default -@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then - - $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h - -+ # Check whether --enable-eh-frame-hdr-for-static was given. -+if test "${enable_eh_frame_hdr_for_static+set}" = set; then : -+ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in -+ yes | no) ;; -+ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid -+value for --enable-eh-frame-hdr-for-static. -+Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; -+ esac -+else -+ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from -+# Linux kernel. -+ if test x$host = x$build -a x$host = x$target && -+ ldd --version 2>&1 >/dev/null && -+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then -+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` -+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` -+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` -+ if test "$glibcnum" -ge 2003 ; then -+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` -+ if echo "$auvx" | grep AT_PHDR > /dev/null && -+ echo "$auvx" | grep AT_PHNUM > /dev/null; then -+ enable_eh_frame_hdr_for_static=yes -+ fi -+ fi -+ fi -+fi -+ -+ if test x$enable_eh_frame_hdr_for_static = xyes; then -+ -+$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h -+ -+ fi - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 - $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } -diff --git a/gcc/configure.ac b/gcc/configure.ac -index 6c1dcd9..0cf7419 100644 ---- a/gcc/configure.ac -+++ b/gcc/configure.ac -@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) - if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then - AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, - [Define if your linker supports .eh_frame_hdr.]) -+ AC_ARG_ENABLE(eh-frame-hdr-for-static, -+ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], -+ [enable linker PT_GNU_EH_FRAME support for static executable])], -+ [case $enable_eh_frame_hdr_for_static in -+ yes | no) ;; -+ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid -+value for --enable-eh-frame-hdr-for-static. -+Valid choices are 'yes' and 'no'.]) ;; -+ esac], -+# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from -+# Linux kernel. 
-+ [[if test x$host = x$build -a x$host = x$target && -+ ldd --version 2>&1 >/dev/null && -+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then -+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` -+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` -+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` -+ if test "$glibcnum" -ge 2003 ; then -+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` -+ if echo "$auvx" | grep AT_PHDR > /dev/null && -+ echo "$auvx" | grep AT_PHNUM > /dev/null; then -+ enable_eh_frame_hdr_for_static=yes -+ fi -+ fi -+ fi]]) -+ if test x$enable_eh_frame_hdr_for_static = xyes; then -+ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, -+[Define if your system supports PT_GNU_EH_FRAME for static executable.]) -+ fi - fi - AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) - -diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C -new file mode 100644 -index 0000000..15408ef ---- /dev/null -+++ b/gcc/testsuite/g++.dg/eh/spec3-static.C -@@ -0,0 +1,25 @@ -+// PR c++/4381 -+// Test that exception-specs work properly for classes with virtual bases. -+ -+// { dg-do run } -+// { dg-options "-static" } -+ -+class Base {}; -+ -+struct A : virtual public Base -+{ -+ A() {} -+}; -+ -+struct B {}; -+ -+void func() throw (B,A) -+{ -+ throw A(); -+} -+ -+int main(void) -+{ -+ try { func(); } -+ catch (A& a) { } -+} -diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c -index f3343fc..d426477 100644 ---- a/libgcc/crtstuff.c -+++ b/libgcc/crtstuff.c -@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) - #include - # define USE_PT_GNU_EH_FRAME -@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(__sun__) && defined(__svr4__) - #include - # define USE_PT_GNU_EH_FRAME -@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ -+ && !defined(inhibit_libc) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(__GLIBC__) && __GLIBC__ >= 2 - #include - /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. -@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ - #if defined(OBJECT_FORMAT_ELF) \ - && !defined(OBJECT_FORMAT_FLAT) \ - && defined(HAVE_LD_EH_FRAME_HDR) \ -- && !defined(CRTSTUFFT_O) \ -+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ - && defined(inhibit_libc) \ - && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) - /* On systems using glibc, an inhibit_libc build of libgcc is only -diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h -index 555c0fe..47c8655 100644 ---- a/libgcc/gthr-posix.h -+++ b/libgcc/gthr-posix.h -@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see - #define __GTHREADS 1 - #define __GTHREADS_CXX0X 1 - -+/* The following should normally be in a different header file, -+ * but I couldn't find the right location. The point of the macro -+ * definition below is to prevent libsupc++ and libstdc++ to reference -+ * weak symbols in their static C++ constructors. Such code crashes -+ * when a shared object linked statically to these libraries is -+ * loaded on Android 2.1 (Eclair) and older platform releases, due -+ * to a dynamic linker bug. -+ */ -+#ifdef __ANDROID__ -+#undef GTHREAD_USE_WEAK -+#define GTHREAD_USE_WEAK 0 -+#endif -+ - #include - - #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ -diff --git a/libgcc/gthr.h b/libgcc/gthr.h -index 47a7d06..67a680f 100644 ---- a/libgcc/gthr.h -+++ b/libgcc/gthr.h -@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - #define GTHREAD_USE_WEAK 1 - #endif - #endif -+#if __ANDROID__ -+#include "gthr-posix.h" -+#else - #include "gthr-default.h" -+#endif - - #ifndef HIDE_EXPORTS - #pragma GCC visibility pop -diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure -index 41797a9..5a631dd 100755 ---- a/libstdc++-v3/configure -+++ b/libstdc++-v3/configure -@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext - /* end confdefs.h. */ - #include - int lk; -+#if !defined(SYS_gettid) -+#define SYS_gettid __NR_gettid -+#endif -+#if !defined(SYS_futex) -+#define SYS_futex __NR_futex -+#endif - int - main () - { -@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext - /* end confdefs.h. */ - #include - int lk; -+#if !defined(SYS_gettid) -+#define SYS_gettid __NR_gettid -+#endif -+#if !defined(SYS_futex) -+#define SYS_futex __NR_futex -+#endif - int - main () - { -diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h -index e3e206b..e85dc2c 100644 ---- a/libstdc++-v3/include/bits/locale_facets.h -+++ b/libstdc++-v3/include/bits/locale_facets.h -@@ -47,6 +47,20 @@ - #include - #include - -+#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ -+// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and -+// ctype::_M_widen_init() methods working wrong if optimization enabled. -+// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) -+// are completely messed up and don't correspond to passed values. In case if -+// we disable optimization for those methods, things become correct so we apply -+// this workaround here for a time. -+// TODO: figure out what exactly wrong here - is it bug in GCC optimization -+// algorithm or smth else? -+#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) -+#else -+#define __CRYSTAX_X86_DONT_OPTIMIZE -+#endif -+ - namespace std _GLIBCXX_VISIBILITY(default) - { - _GLIBCXX_BEGIN_NAMESPACE_VERSION -@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION - * @return @a __hi. 
- */ - virtual const char* -- do_widen(const char* __lo, const char* __hi, char_type* __to) const -+ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE - { - __builtin_memcpy(__to, __lo, __hi - __lo); - return __hi; -@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION - - private: - void _M_narrow_init() const; -- void _M_widen_init() const; -+ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; - }; - - #ifdef _GLIBCXX_USE_WCHAR_T -diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc -index 9b61799..c149169 100644 ---- a/libstdc++-v3/libsupc++/guard.cc -+++ b/libstdc++-v3/libsupc++/guard.cc -@@ -33,7 +33,12 @@ - #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ - && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) - # include -+#if defined(__ANDROID__) -+# include -+# define SYS_futex __NR_futex -+#else - # include -+#endif - # include - # define _GLIBCXX_USE_FUTEX - # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/linaro-6.3-2017.02/970-crystax.patch b/patches/gcc/linaro-6.3-2017.02/970-crystax.patch new file mode 100644 index 0000000..e3109cc --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/970-crystax.patch @@ -0,0 +1,551 @@ +commit 080803512c8f6f87c2f1f711170d54033144d628 +Author: Dmitry Moskalchuk +Date: Wed Jul 29 11:28:29 2015 +0300 + + [android] Apply Android-related modifications + + Signed-off-by: Dmitry Moskalchuk + +[Edited: keep libstdc++, drop libcrystax-related modifications] +diff --git a/gcc/config.gcc b/gcc/config.gcc +index f66e48cd1..1c253496b 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*) + TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'` + ;; + aarch64*-*-linux*) +- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" ++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ extra_options="${extra_options} linux-android.opt" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + case $target in + aarch64_be-*) + tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1" + ;; ++ aarch64*-*-linux-android*) ++ tm_file="${tm_file} aarch64/aarch64-linux-android.h" ++ ;; + esac + aarch64_multilibs="${with_multilib_list}" + if test "$aarch64_multilibs" = "default"; then +@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian. + tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h" + extra_options="${extra_options} linux-android.opt" + case ${target} in ++ mips64*android*) ++ default_mips_arch=mips64r6 ++ default_mips_abi=64 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="${tmake_file} mips/t-linux-android64" ++ ;; ++ mips*android*) ++ default_mips_arch=mips32 ++ tm_file="${tm_file} mips/android.h" ++ tmake_file="$tmake_file mips/t-linux-android" ++ ;; + mipsisa32r6*) + default_mips_arch=mips32r6 + ;; +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +new file mode 100644 +index 000000000..db1288fd0 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -0,0 +1,59 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++ This file is part of GCC. 
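
Two side notes on the hunks above. First, the __CRYSTAX_X86_DONT_OPTIMIZE annotation applied to do_widen() and _M_widen_init() is just GCC's per-function optimize attribute, which compiles a single function at -O0 while the rest of the translation unit keeps its normal optimization level. A stand-alone illustration of the mechanism (none of this is libstdc++ code; the names are invented):

/* Pinning one function to -O0 with GCC's "optimize" attribute, the same
   mechanism __CRYSTAX_X86_DONT_OPTIMIZE uses above. */
#include <string.h>
#include <stdio.h>

#if !defined(__clang__) && defined(__GNUC__)
# define EXAMPLE_DONT_OPTIMIZE __attribute__((optimize(0)))
#else
# define EXAMPLE_DONT_OPTIMIZE
#endif

/* Built at -O0 even when the file is compiled with -O2. */
EXAMPLE_DONT_OPTIMIZE
const char *copy_range (const char *lo, const char *hi, char *to)
{
  memcpy (to, lo, (size_t) (hi - lo));
  return hi;
}

int main (void)
{
  char buf[6] = { 0 };
  const char *hello = "hello";
  copy_range (hello, hello + 5, buf);
  puts (buf);
  return 0;
}

Second, the guard.cc hunk above (and the matching SYS_gettid/SYS_futex fallbacks added to the libstdc++ configure tests) is needed presumably because bionic's syscall headers did not always provide the SYS_* aliases, only the __NR_* numbers. Once SYS_futex is defined, the guard code issues raw futex system calls, roughly as in this sketch (wrapper names invented; operation code 0 matches the _GLIBCXX_FUTEX_WAIT value visible above, and 1 is the kernel's FUTEX_WAKE):

/* Raw futex calls of the kind libstdc++'s guard.cc makes once SYS_futex is
   known.  Wrapper names are invented for this sketch. */
#define _GNU_SOURCE 1
#include <unistd.h>
#include <sys/syscall.h>
#include <limits.h>

#ifndef SYS_futex               /* bionic: only __NR_futex is guaranteed */
# define SYS_futex __NR_futex
#endif

#define EXAMPLE_FUTEX_WAIT 0    /* same value as _GLIBCXX_FUTEX_WAIT above */
#define EXAMPLE_FUTEX_WAKE 1

void example_futex_wait (int *addr, int expected)
{
  /* Blocks while *addr still equals 'expected'. */
  syscall (SYS_futex, addr, EXAMPLE_FUTEX_WAIT, expected, NULL, NULL, 0);
}

void example_futex_wake_all (int *addr)
{
  syscall (SYS_futex, addr, EXAMPLE_FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
}

int main (void)
{
  int guard = 0;
  example_futex_wake_all (&guard);   /* no waiters: returns immediately */
  return 0;
}
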
++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_LINUX_ANDROID_H ++#define GCC_AARCH64_LINUX_ANDROID_H ++ ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ ++ } \ ++ while (0) ++ ++#undef LINK_SPEC ++#define LINK_SPEC \ ++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \ ++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC) ++ ++#undef CC1_SPEC ++#define CC1_SPEC \ ++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) ++ ++#undef LIB_SPEC ++#define LIB_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ ++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC) ++ ++#undef STARTFILE_SPEC ++#define STARTFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) ++ ++#undef ENDFILE_SPEC ++#define ENDFILE_SPEC \ ++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) ++ ++#endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5fcaa59a3..6864195ee 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -21,7 +21,14 @@ + #ifndef GCC_AARCH64_LINUX_H + #define GCC_AARCH64_LINUX_H + +-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifndef RUNTIME_ROOT_PREFIX ++#define RUNTIME_ROOT_PREFIX "" ++#endif ++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" ++#ifdef BIONIC_DYNAMIC_LINKER ++#undef BIONIC_DYNAMIC_LINKER ++#endif ++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64" + + #undef MUSL_DYNAMIC_LINKER + #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1" +diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h +index ad123dde9..97b059de6 100644 +--- a/gcc/config/arm/arm.h ++++ b/gcc/config/arm/arm.h +@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes + + #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \ + || (TARGET_THUMB1 \ ++ && !inline_thumb1_jump_table \ + && (optimize_size || flag_pic))) + + #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \ +- (TARGET_THUMB1 \ ++ (TARGET_THUMB1 && !inline_thumb1_jump_table \ + ? (min >= 0 && max < 512 \ + ? 
(ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \ + : min >= -256 && max < 256 \ +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 47171b996..eb22d1181 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -8179,7 +8179,7 @@ + (match_operand:SI 2 "const_int_operand" "") ; total range + (match_operand:SI 3 "" "") ; table label + (match_operand:SI 4 "" "")] ; Out of range label +- "TARGET_32BIT || optimize_size || flag_pic" ++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)" + " + { + enum insn_code code; +diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt +index 0ebe01743..772453889 100644 +--- a/gcc/config/arm/arm.opt ++++ b/gcc/config/arm/arm.opt +@@ -193,6 +193,10 @@ mthumb-interwork + Target Report Mask(INTERWORK) + Support calls between Thumb and ARM instruction sets. + ++minline-thumb1-jumptable ++Target Report Var(inline_thumb1_jump_table) ++Inline Thumb1 Jump table code ++ + mtls-dialect= + Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU) + Specify thread local storage scheme. +diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h +index 77f30554d..32158ed65 100644 +--- a/gcc/config/arm/elf.h ++++ b/gcc/config/arm/elf.h +@@ -56,8 +56,7 @@ + #undef SUBSUBTARGET_EXTRA_SPECS + #define SUBSUBTARGET_EXTRA_SPECS + +-#ifndef ASM_SPEC +-#define ASM_SPEC "\ ++#define DEFAULT_ASM_SPEC "\ + %{mbig-endian:-EB} \ + %{mlittle-endian:-EL} \ + %(asm_cpu_spec) \ +@@ -66,6 +65,9 @@ + %{mthumb-interwork:-mthumb-interwork} \ + %{mfloat-abi=*} %{mfpu=*} \ + %(subtarget_extra_asm_spec)" ++ ++#ifndef ASM_SPEC ++#define ASM_SPEC DEFAULT_ASM_SPEC + #endif + + /* The ARM uses @ are a comment character so we need to redefine +@@ -104,7 +106,8 @@ + the code more efficient, but for Thumb-1 it's better to put them out of + band unless we are generating compressed tables. */ + #define JUMP_TABLES_IN_TEXT_SECTION \ +- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) ++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \ ++ && (optimize_size || flag_pic))) + + #ifndef LINK_SPEC + #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" +diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h +index ace84816e..8c8fa6553 100644 +--- a/gcc/config/arm/linux-eabi.h ++++ b/gcc/config/arm/linux-eabi.h +@@ -108,11 +108,16 @@ + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \ + GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \ +- ANDROID_CC1_SPEC) ++ ANDROID_CC1_SPEC("-fpic")) + + #define CC1PLUS_SPEC \ + LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + ++#undef ASM_SPEC ++#define ASM_SPEC \ ++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \ ++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC) ++ + #undef LIB_SPEC + #define LIB_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \ +diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h +index fee33a3ef..22fb2ced9 100644 +--- a/gcc/config/i386/gnu-user.h ++++ b/gcc/config/i386/gnu-user.h +@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see + When the -shared link option is used a final link is not being + done. 
*/ + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ " -mssse3 -fno-short-enums " \ ++ + #undef ASM_SPEC + #define ASM_SPEC \ +- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}" ++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + #undef SUBTARGET_EXTRA_SPECS + #define SUBTARGET_EXTRA_SPECS \ +diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h +index 7a02a7eb4..cac4179bc 100644 +--- a/gcc/config/i386/gnu-user64.h ++++ b/gcc/config/i386/gnu-user64.h +@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define SPEC_X32 "mx32" + #endif + ++#undef ANDROID_TARGET_CC1_SPEC ++#define ANDROID_TARGET_CC1_SPEC \ ++ "%{m32:-mssse3 -fno-short-enums}" \ ++ "%{!m32:-msse4.2 -mpopcnt}" ++ + #undef ASM_SPEC + #define ASM_SPEC "%{" SPEC_32 ":--32} \ + %{" SPEC_64 ":--64} \ +diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h +index 4b9910fa9..3b11ed086 100644 +--- a/gcc/config/i386/linux-common.h ++++ b/gcc/config/i386/linux-common.h +@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see + #undef CC1_SPEC + #define CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC \ ++ ANDROID_TARGET_CC1_SPEC \ ++ " " \ ++ ANDROID_CC1_SPEC("-fPIC")) ++ ++#define CC1PLUS_SPEC \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC) + + #undef LINK_SPEC + #define LINK_SPEC \ +diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h +index 301a41ccd..9623c88d0 100644 +--- a/gcc/config/linux-android.h ++++ b/gcc/config/linux-android.h +@@ -38,15 +39,18 @@ + "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}" + + #define ANDROID_LINK_SPEC \ +- "%{shared: -Bsymbolic}" ++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now" + +-#define ANDROID_CC1_SPEC \ ++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \ + "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \ +- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}" ++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}" + + #define ANDROID_CC1PLUS_SPEC \ +- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \ +- "%{!frtti:%{!fno-rtti: -fno-rtti}}" ++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \ ++ "%{!frtti:%{!fno-rtti: -frtti}}" ++ ++#define ANDROID_ASM_SPEC \ ++ "--noexecstack" + + #define ANDROID_LIB_SPEC \ + "%{!static: -ldl}" +diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h +new file mode 100644 +index 000000000..32c539c8d +--- /dev/null ++++ b/gcc/config/mips/android.h +@@ -0,0 +1,49 @@ ++/* Target macros for mips*-*android* targets. ++ Copyright (C) 2014 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#undef DRIVER_SELF_SPECS ++#define DRIVER_SELF_SPECS \ ++ /* Make sure a -mips option is present. 
This helps us to pick \ ++ the right multilib, and also makes the later specs easier \ ++ to write. */ \ ++ MIPS_ISA_LEVEL_SPEC, \ ++ \ ++ /* Infer the default float setting from -march. */ \ ++ MIPS_ARCH_FLOAT_SPEC, \ ++ \ ++ /* Infer the -msynci setting from -march if not explicitly set. */ \ ++ MIPS_ISA_SYNCI_SPEC, \ ++ \ ++ /* If no ABI option is specified, infer one from the ISA level \ ++ or -mgp setting. */ \ ++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \ ++ \ ++ /* If no FP ABI option is specified, infer one from the \ ++ ABI/ISA level unless there is a conflicting option. */ \ ++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \ ++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \ ++ \ ++ /* If no odd-spreg option is specified, infer one from the ISA. */ \ ++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \ ++ \ ++ /* Base SPECs. */ \ ++ BASE_DRIVER_SELF_SPECS, \ ++ \ ++ /* Use the standard linux specs for everything else. */ \ ++ LINUX_DRIVER_SELF_SPECS +diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h +index 15b549c08..4a2816014 100644 +--- a/gcc/config/mips/gnu-user.h ++++ b/gcc/config/mips/gnu-user.h +@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ ++ ANDROID_TARGET_OS_CPP_BUILTINS(); \ + } while (0) + + #undef SUBTARGET_CPP_SPEC +@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see + + #undef SUBTARGET_ASM_SPEC + #define SUBTARGET_ASM_SPEC \ +- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}" ++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \ ++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC) + + /* The MIPS assembler has different syntax for .set. We set it to + .dummy to trap any errors. */ +@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + #endif + + #define LINUX_DRIVER_SELF_SPECS \ +- NO_SHARED_SPECS \ ++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \ + MARCH_MTUNE_NATIVE_SPECS, \ + /* -mplt has no effect without -mno-shared. Simplify later \ + specs handling by removing a redundant option. */ \ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8429a7ca2..8bfacf994 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. 
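
A note on the ANDROID_CC1_SPEC change in linux-android.h above: turning it into a function-like macro lets each backend pick its own PIC default; in these patches the ARM, AArch64 and MIPS targets pass "-fpic" while x86/x86_64 pass "-fPIC". Since GCC specs are ordinary C string literals, the parameterization is plain literal concatenation, which the small stand-alone program below simply prints out (the macro body is copied from the patch; the program itself is only an illustration):

/* Prints the spec produced by the parameterized ANDROID_CC1_SPEC macro for
   the two PIC defaults used in these patches. */
#include <stdio.h>

#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
  "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
  "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"

int main (void)
{
  printf ("arm/aarch64/mips: %s\n", ANDROID_CC1_SPEC ("-fpic"));
  printf ("x86/x86_64:       %s\n", ANDROID_CC1_SPEC ("-fPIC"));
  return 0;
}
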
If not see + #undef SUBTARGET_CC1_SPEC + #define SUBTARGET_CC1_SPEC \ + LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \ +- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC) ++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic")) + + #undef CC1PLUS_SPEC + #define CC1PLUS_SPEC \ +diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android +new file mode 100644 +index 000000000..39f512c81 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android +@@ -0,0 +1,3 @@ ++MULTILIB_OPTIONS = mips32r2/mips32r6 ++MULTILIB_DIRNAMES = mips-r2 mips-r6 ++MULTILIB_OSDIRNAMES = ../libr2 ../libr6 +diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64 +new file mode 100644 +index 000000000..55cab7d62 +--- /dev/null ++++ b/gcc/config/mips/t-linux-android64 +@@ -0,0 +1,4 @@ ++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6 ++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6 ++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64 ++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6 +diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h +index 555c0fe24..47c8655f9 100644 +--- a/libgcc/gthr-posix.h ++++ b/libgcc/gthr-posix.h +@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define __GTHREADS 1 + #define __GTHREADS_CXX0X 1 + ++/* The following should normally be in a different header file, ++ * but I couldn't find the right location. The point of the macro ++ * definition below is to prevent libsupc++ and libstdc++ to reference ++ * weak symbols in their static C++ constructors. Such code crashes ++ * when a shared object linked statically to these libraries is ++ * loaded on Android 2.1 (Eclair) and older platform releases, due ++ * to a dynamic linker bug. ++ */ ++#ifdef __ANDROID__ ++#undef GTHREAD_USE_WEAK ++#define GTHREAD_USE_WEAK 0 ++#endif ++ + #include + + #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \ +diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure +index 41797a971..f746e8353 100755 +--- a/libstdc++-v3/configure ++++ b/libstdc++-v3/configure +@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include + int lk; ++#if !defined(SYS_gettid) ++#define SYS_gettid __NR_gettid ++#endif ++#if !defined(SYS_futex) ++#define SYS_futex __NR_futex ++#endif + int + main () + { +diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h +index e3e206b7d..e85dc2c76 100644 +--- a/libstdc++-v3/include/bits/locale_facets.h ++++ b/libstdc++-v3/include/bits/locale_facets.h +@@ -47,6 +47,20 @@ + #include + #include + ++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__ ++// CrystaX: for some reason, x86 gcc-4.9 makes ctype::do_widen() and ++// ctype::_M_widen_init() methods working wrong if optimization enabled. ++// For ctype::do_widen(), values of passed arguments (__lo, __hi and __to) ++// are completely messed up and don't correspond to passed values. In case if ++// we disable optimization for those methods, things become correct so we apply ++// this workaround here for a time. 
++// TODO: figure out what exactly wrong here - is it bug in GCC optimization ++// algorithm or smth else? ++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0))) ++#else ++#define __CRYSTAX_X86_DONT_OPTIMIZE ++#endif ++ + namespace std _GLIBCXX_VISIBILITY(default) + { + _GLIBCXX_BEGIN_NAMESPACE_VERSION +@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + * @return @a __hi. + */ + virtual const char* +- do_widen(const char* __lo, const char* __hi, char_type* __to) const ++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE + { + __builtin_memcpy(__to, __lo, __hi - __lo); + return __hi; +@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION + + private: + void _M_narrow_init() const; +- void _M_widen_init() const; ++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE; + }; + + #ifdef _GLIBCXX_USE_WCHAR_T +diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc +index 9b617998f..c149169bb 100644 +--- a/libstdc++-v3/libsupc++/guard.cc ++++ b/libstdc++-v3/libsupc++/guard.cc +@@ -33,7 +33,12 @@ + #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \ + && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX) + # include ++#if defined(__ANDROID__) ++# include ++# define SYS_futex __NR_futex ++#else + # include ++#endif + # include + # define _GLIBCXX_USE_FUTEX + # define _GLIBCXX_FUTEX_WAIT 0 diff --git a/patches/gcc/linaro-6.3-2017.02/971-crystax.patch b/patches/gcc/linaro-6.3-2017.02/971-crystax.patch new file mode 100644 index 0000000..748a381 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/971-crystax.patch @@ -0,0 +1,25 @@ +commit 9f057b62caafe08c968103d39b5df82486a175c2 +Author: Dmitry Moskalchuk +Date: Thu Aug 13 16:11:54 2015 +0300 + + [android] Add additional multilib option: mfloat-abi=hard + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi +index 8f1307c55..cbbec5bd2 100644 +--- a/gcc/config/arm/t-linux-androideabi ++++ b/gcc/config/arm/t-linux-androideabi +@@ -1,8 +1,9 @@ +-MULTILIB_OPTIONS = march=armv7-a mthumb +-MULTILIB_DIRNAMES = armv7-a thumb +-MULTILIB_EXCEPTIONS = ++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard ++MULTILIB_DIRNAMES = armv7-a thumb hard ++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard* + MULTILIB_MATCHES = + MULTILIB_OSDIRNAMES = ++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch + + # The "special" multilib can be used to build native applications for Android, + # as opposed to native shared libraries that are then called via JNI. diff --git a/patches/gcc/linaro-6.3-2017.02/972-crystax.patch b/patches/gcc/linaro-6.3-2017.02/972-crystax.patch new file mode 100644 index 0000000..b9077be --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/972-crystax.patch @@ -0,0 +1,302 @@ +commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd +Author: H.J. Lu +Date: Mon Apr 14 15:59:47 2014 -0700 + + [android] Always enable --eh-frame-hdr for static executable + + See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and + 23e3137ee2897464b051599b85a09f130d3ad05d + + Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config.in b/gcc/config.in +index 115cb6163..933916833 100644 +--- a/gcc/config.in ++++ b/gcc/config.in +@@ -2119,6 +2119,12 @@ + #endif + + ++/* Define if your system supports PT_GNU_EH_FRAME for static executable. 
*/ ++#ifndef USED_FOR_TARGET ++#undef USE_EH_FRAME_HDR_FOR_STATIC ++#endif ++ ++ + /* Define to 1 if the 'long long' type is wider than 'long' but still + efficiently supported by the host hardware. */ + #ifndef USED_FOR_TARGET +diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h +index 093c38bba..54b3e0c91 100644 +--- a/gcc/config/alpha/elf.h ++++ b/gcc/config/alpha/elf.h +@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number; + I imagine that other systems will catch up. In the meantime, it + doesn't harm to make sure that the data exists to be used later. */ + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif +diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h +index 5ded869d2..5f51ac81d 100644 +--- a/gcc/config/freebsd.h ++++ b/gcc/config/freebsd.h +@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see + #define LIB_SPEC FBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #ifdef TARGET_LIBC_PROVIDES_SSP + #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \ +diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h +index b0bf40a95..d1874bc29 100644 +--- a/gcc/config/gnu-user.h ++++ b/gcc/config/gnu-user.h +@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LINK_GCC_C_SEQUENCE_SPEC + #define LINK_GCC_C_SEQUENCE_SPEC \ +diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h +index 37ecfc43f..a5f1b9955 100644 +--- a/gcc/config/openbsd.h ++++ b/gcc/config/openbsd.h +@@ -136,8 +136,12 @@ while (0) + #define LIB_SPEC OBSD_LIB_SPEC + + #if defined(HAVE_LD_EH_FRAME_HDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " + #endif ++#endif + + #undef LIB_SPEC + #define LIB_SPEC OBSD_LIB_SPEC +diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h +index cbf909722..eb2217fad 100644 +--- a/gcc/config/rs6000/sysv4.h ++++ b/gcc/config/rs6000/sysv4.h +@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN) + -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + #if defined(HAVE_LD_EH_FRAME_HDR) +-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# ifdef USE_EH_FRAME_HDR_FOR_STATIC ++# define LINK_EH_SPEC "--eh-frame-hdr " ++# else ++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++# endif + #endif + + #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \ +diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h +index 5160e1fda..7632a5081 100644 +--- a/gcc/config/sol2.h ++++ b/gcc/config/sol2.h +@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see + /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs + --eh-frame-hdr to create the required .eh_frame_hdr sections. 
*/ + #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) ++#ifdef USE_EH_FRAME_HDR_FOR_STATIC ++#define LINK_EH_SPEC "--eh-frame-hdr " ++#else + #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} " ++#endif + #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */ + #endif + +diff --git a/gcc/configure b/gcc/configure +index 1c6e3407c..28ad05004 100755 +--- a/gcc/configure ++++ b/gcc/configure +@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769 + enable_fix_cortex_a53_843419 + with_glibc_version + enable_gnu_unique_object ++enable_eh_frame_hdr_for_static + enable_linker_build_id + enable_default_ssp + with_long_double_128 +@@ -1670,6 +1671,9 @@ Optional Features: + --enable-gnu-unique-object + enable the use of the @gnu_unique_object ELF + extension on glibc systems ++ --enable-eh-frame-hdr-for-static ++ enable linker PT_GNU_EH_FRAME support for static ++ executable + --enable-linker-build-id + compiler will always pass --build-id to linker + --enable-default-ssp enable Stack Smashing Protection as default +@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + + $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h + ++ # Check whether --enable-eh-frame-hdr-for-static was given. ++if test "${enable_eh_frame_hdr_for_static+set}" = set; then : ++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;; ++ esac ++else ++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. ++ if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi ++fi ++ ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ ++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h ++ ++ fi + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5 + $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; } +diff --git a/gcc/configure.ac b/gcc/configure.ac +index 6c1dcd9ae..0cf7419e7 100644 +--- a/gcc/configure.ac ++++ b/gcc/configure.ac +@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR]) + if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then + AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1, + [Define if your linker supports .eh_frame_hdr.]) ++ AC_ARG_ENABLE(eh-frame-hdr-for-static, ++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static], ++ [enable linker PT_GNU_EH_FRAME support for static executable])], ++ [case $enable_eh_frame_hdr_for_static in ++ yes | no) ;; ++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid ++value for --enable-eh-frame-hdr-for-static. ++Valid choices are 'yes' and 'no'.]) ;; ++ esac], ++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from ++# Linux kernel. 
++ [[if test x$host = x$build -a x$host = x$target && ++ ldd --version 2>&1 >/dev/null && ++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then ++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"` ++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"` ++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor` ++ if test "$glibcnum" -ge 2003 ; then ++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null` ++ if echo "$auvx" | grep AT_PHDR > /dev/null && ++ echo "$auvx" | grep AT_PHNUM > /dev/null; then ++ enable_eh_frame_hdr_for_static=yes ++ fi ++ fi ++ fi]]) ++ if test x$enable_eh_frame_hdr_for_static = xyes; then ++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1, ++[Define if your system supports PT_GNU_EH_FRAME for static executable.]) ++ fi + fi + AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr) + +diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C +new file mode 100644 +index 000000000..15408effa +--- /dev/null ++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C +@@ -0,0 +1,25 @@ ++// PR c++/4381 ++// Test that exception-specs work properly for classes with virtual bases. ++ ++// { dg-do run } ++// { dg-options "-static" } ++ ++class Base {}; ++ ++struct A : virtual public Base ++{ ++ A() {} ++}; ++ ++struct B {}; ++ ++void func() throw (B,A) ++{ ++ throw A(); ++} ++ ++int main(void) ++{ ++ try { func(); } ++ catch (A& a) { } ++} +diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c +index f3343fc4f..d42647779 100644 +--- a/libgcc/crtstuff.c ++++ b/libgcc/crtstuff.c +@@ -88,7 +88,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(BSD_DL_ITERATE_PHDR_AVAILABLE) + #include + # define USE_PT_GNU_EH_FRAME +@@ -97,7 +98,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__sun__) && defined(__svr4__) + #include + # define USE_PT_GNU_EH_FRAME +@@ -106,7 +108,8 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \ ++ && !defined(inhibit_libc) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(__GLIBC__) && __GLIBC__ >= 2 + #include + /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h. 
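
The configure probe above and the relaxed CRTSTUFFT_O conditions in crtstuff.c together allow --eh-frame-hdr to be used for static links as well, so a statically linked program can still find its unwind tables through the PT_GNU_EH_FRAME program header. The stand-alone sketch below (not part of the patch; it assumes glibc 2.16+ for <sys/auxv.h>) shows both halves: the auxiliary-vector data the configure check greps out of LD_SHOW_AUXV=1 ldd output, and roughly how libgcc's unwinder locates the segment at run time via dl_iterate_phdr:

/* Illustration only: the configure check above wants AT_PHDR/AT_PHNUM in the
   auxiliary vector; given that, the unwinder can find PT_GNU_EH_FRAME with
   dl_iterate_phdr even in a static executable linked with --eh-frame-hdr. */
#define _GNU_SOURCE 1
#include <link.h>
#include <sys/auxv.h>
#include <stdio.h>

static int show_eh_frame_hdr (struct dl_phdr_info *info, size_t size, void *data)
{
  (void) size;
  (void) data;
  for (int i = 0; i < info->dlpi_phnum; i++)
    if (info->dlpi_phdr[i].p_type == PT_GNU_EH_FRAME)
      printf ("%s: PT_GNU_EH_FRAME at %#lx\n",
              info->dlpi_name[0] ? info->dlpi_name : "(main executable)",
              (unsigned long) (info->dlpi_addr + info->dlpi_phdr[i].p_vaddr));
  return 0;
}

int main (void)
{
  /* The same data the configure test reads from the auxiliary vector. */
  printf ("AT_PHDR=%#lx AT_PHNUM=%lu\n",
          getauxval (AT_PHDR), getauxval (AT_PHNUM));

  dl_iterate_phdr (show_eh_frame_hdr, NULL);
  return 0;
}
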
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \ + #if defined(OBJECT_FORMAT_ELF) \ + && !defined(OBJECT_FORMAT_FLAT) \ + && defined(HAVE_LD_EH_FRAME_HDR) \ +- && !defined(CRTSTUFFT_O) \ ++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \ + && defined(inhibit_libc) \ + && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__)) + /* On systems using glibc, an inhibit_libc build of libgcc is only diff --git a/patches/gcc/linaro-6.3-2017.02/973-crystax.patch b/patches/gcc/linaro-6.3-2017.02/973-crystax.patch new file mode 100644 index 0000000..b96ece3 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/973-crystax.patch @@ -0,0 +1,20 @@ +commit 778a9ef107f51544d583f110e92b75f4d9d79117 +Author: Dmitry Moskalchuk +Date: Thu Aug 20 19:11:07 2015 +0300 + + [android] Don't use PIE copyrelocs for x86/x86_64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 3d044e8bd..5c89fcab0 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp) + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC ++ && !TARGET_HAS_BIONIC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) diff --git a/patches/gcc/linaro-6.3-2017.02/974-crystax.patch b/patches/gcc/linaro-6.3-2017.02/974-crystax.patch new file mode 100644 index 0000000..9db4f54 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/974-crystax.patch @@ -0,0 +1,24 @@ +commit dbeae1190cabad83999f2540523f045acc1bb4ec +Author: Dmitry Moskalchuk +Date: Fri Aug 21 17:41:59 2015 +0300 + + [android] Always use gthr-posix.h instead of gthr-default.h + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/libgcc/gthr.h b/libgcc/gthr.h +index 47a7d061a..67a680f90 100644 +--- a/libgcc/gthr.h ++++ b/libgcc/gthr.h +@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define GTHREAD_USE_WEAK 1 + #endif + #endif ++#if __ANDROID__ ++#include "gthr-posix.h" ++#else + #include "gthr-default.h" ++#endif + + #ifndef HIDE_EXPORTS + #pragma GCC visibility pop diff --git a/patches/gcc/linaro-6.3-2017.02/975-crystax.patch b/patches/gcc/linaro-6.3-2017.02/975-crystax.patch new file mode 100644 index 0000000..9efc2a4 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/975-crystax.patch @@ -0,0 +1,31 @@ +commit 8a66d422721ae5999737d7825701ff22097d287b +Author: Andrew Hsieh +Date: Mon Apr 14 21:05:51 2014 -0700 + + [android] Fix ARM generates insufficient alignment for NEON vst/vld + + See d909af3e2469aad87d5c3e79b93c778fd26c03a9 + + Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484 + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index 5974c65d3..71b2c7aa9 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code) + memsize = MEM_SIZE (x); + + /* Only certain alignment specifiers are supported by the hardware. */ +- if (memsize == 32 && (align % 32) == 0) ++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC ++ honors stricter alignment of composite type in user code, it doesn't ++ observe the alignment of memory passed as an extra argument for function ++ returning large composite type. 
See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */ ++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED) + align_bits = 256; +- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0) ++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED) + align_bits = 128; + else if (memsize >= 8 && (align % 8) == 0) + align_bits = 64; diff --git a/patches/gcc/linaro-6.3-2017.02/976-crystax.patch b/patches/gcc/linaro-6.3-2017.02/976-crystax.patch new file mode 100644 index 0000000..790d4a9 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/976-crystax.patch @@ -0,0 +1,21 @@ +commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234 +Author: Dmitry Moskalchuk +Date: Sat Aug 22 09:55:55 2015 +0300 + + [android][i386] Remove throw() declaration from posix_memalign() proto + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h +index a1f98d3d1..3725799be 100644 +--- a/gcc/config/i386/pmm_malloc.h ++++ b/gcc/config/i386/pmm_malloc.h +@@ -31,7 +31,7 @@ + #ifndef __cplusplus + extern int posix_memalign (void **, size_t, size_t); + #else +-extern "C" int posix_memalign (void **, size_t, size_t) throw (); ++extern "C" int posix_memalign (void **, size_t, size_t); + #endif + + static __inline void * diff --git a/patches/gcc/linaro-6.3-2017.02/977-crystax.patch b/patches/gcc/linaro-6.3-2017.02/977-crystax.patch new file mode 100644 index 0000000..0211d72 --- /dev/null +++ b/patches/gcc/linaro-6.3-2017.02/977-crystax.patch @@ -0,0 +1,33 @@ +commit 9ae82f7cfc1073820092dd9f957559667e77db0d +Author: Dmitry Moskalchuk +Date: Tue Aug 25 09:36:42 2015 +0300 + + [android] Explicitly make _Unwind_Resume visible for arm64/mips64 + + Signed-off-by: Dmitry Moskalchuk + +diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h +index db1288fd0..38bc64d61 100644 +--- a/gcc/config/aarch64/aarch64-linux-android.h ++++ b/gcc/config/aarch64/aarch64-linux-android.h +@@ -57,4 +57,8 @@ + #define ENDFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) + ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif ++ + #endif /* GCC_AARCH64_LINUX_ANDROID_H */ +diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h +index 8bfacf994..262a9a341 100644 +--- a/gcc/config/mips/linux-common.h ++++ b/gcc/config/mips/linux-common.h +@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see + + /* The default value isn't sufficient in 64-bit mode. */ + #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024) ++ ++#ifdef IN_LIBGCC2 ++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default"))) ++#endif -- cgit v0.10.2-6-g49f6 From dab327ed06800bf6609aa1b2e370e37e87ce9e80 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 12:26:59 -0700 Subject: Allow rebuilding aarch64-unknown-linux-android Otherwise, fails when checking if the destination dir is writable. 
Signed-off-by: Alexey Neyman diff --git a/samples/aarch64-unknown-linux-android/crosstool.config b/samples/aarch64-unknown-linux-android/crosstool.config index fa147e2..339819c 100644 --- a/samples/aarch64-unknown-linux-android/crosstool.config +++ b/samples/aarch64-unknown-linux-android/crosstool.config @@ -1,4 +1,3 @@ -# CT_RM_RF_PREFIX_DIR is not set CT_ARCH_arm=y CT_ARCH_64=y CT_ARCH_ARCH="armv8-a" -- cgit v0.10.2-6-g49f6 From 8e7947cb35a24e8c021848a0951314cc1c8851f7 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 12:27:53 -0700 Subject: Print gcc version in the same way as the rest Signed-off-by: Alexey Neyman diff --git a/scripts/showSamples.sh b/scripts/showSamples.sh index f3a21d2..e60e29f 100755 --- a/scripts/showSamples.sh +++ b/scripts/showSamples.sh @@ -101,7 +101,7 @@ dump_single_sample() { cc=$(echo ${CT_CC} | ${awk} '{ print toupper($0)}') version=$(eval echo \${CT_CC_${cc}_VERSION}) compiler=$(echo $cc | ${awk} '{print tolower($0)}') - printf " $compiler | $version" + printf " $compiler-$version" printf "\n" printf " %-*s : %s" ${width} "Languages" "C" [ "${CT_CC_LANG_CXX}" = "y" ] && printf ",C++" -- cgit v0.10.2-6-g49f6 From f296faa8c21e46e5a1f49d0cecc6e5f0e6624438 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 12:28:46 -0700 Subject: Add arm_neon.h as a TODO item Signed-off-by: Alexey Neyman diff --git a/TODO b/TODO index 89138ff..58694c2 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,7 @@ A (slightly) ordered set of tasks for crosstool-NG. Written in a cryptic languag -- Alexey Neyman (@stilor) +[ ] arm_neon.h - offer as a companion "library" for the target [ ] FreeBSD [ ] Use 'cc' rather than 'gcc' on the host [ ] Detect in configure what the default value is -- cgit v0.10.2-6-g49f6 From 24b3fca91064cd25953d06c8a240fbb9ccc21fc7 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 15:22:35 -0700 Subject: Mark bionic EXPERIMENTAL and dependent on GCC6+ Signed-off-by: Alexey Neyman diff --git a/config/libc/bionic.in b/config/libc/bionic.in index b6b7af5..9e04130 100644 --- a/config/libc/bionic.in +++ b/config/libc/bionic.in @@ -2,6 +2,8 @@ ## depends on ! WINDOWS && ! BARE_METAL ## depends on ARCH_arm || ARCH_mips || ARCH_x86 +## depends on EXPERIMENTAL +## depends on CC_GCC_6_or_later ## ## select LIBC_SUPPORT_THREADS_POSIX ## -- cgit v0.10.2-6-g49f6 From 97a20eed5c3cf3a2f7bb261705405b8b16c56b36 Mon Sep 17 00:00:00 2001 From: Alexey Neyman Date: Sun, 14 May 2017 18:46:34 -0700 Subject: Disallow duma/ltrace/strace for bionic Allow cross-gdb and gdbserver. This requires removal of an explicit check that disables gdbserver on android. However, the comment above that check refers to exec_elf.h, which has been removed since API level 19. It builds fine with current selection of the sample (21). Only build-tested, hope someone can give it a try and report back. We can fine tune the check for GDB but I'll leave it for now. Signed-off-by: Alexey Neyman diff --git a/config/debug/duma.in b/config/debug/duma.in index 170a694..f9e727b 100644 --- a/config/debug/duma.in +++ b/config/debug/duma.in @@ -1,6 +1,7 @@ # D.U.M.A. - Detect Unintended Memory Access - Memory checker ## depends on ! BARE_METAL +## depends on ! LIBC_bionic ## help D.U.M.A. - Detect Unintended Memory Access ## help A memory bound checker, with additional features. 
diff --git a/config/debug/gdb.in.native b/config/debug/gdb.in.native index e856b5d..faee8c3 100644 --- a/config/debug/gdb.in.native +++ b/config/debug/gdb.in.native @@ -4,6 +4,7 @@ config GDB_NATIVE bool prompt "Native gdb" depends on ! BARE_METAL + depends on ! LIBC_bionic select EXPAT_TARGET select NCURSES_TARGET help diff --git a/config/debug/ltrace.in b/config/debug/ltrace.in index 4c62676..fab6b81 100644 --- a/config/debug/ltrace.in +++ b/config/debug/ltrace.in @@ -1,6 +1,7 @@ # ltrace ## select LIBELF_TARGET +## depends on ! LIBC_bionic ## ## help ltrace is a program that simply runs the specified command until it exits. ## help It intercepts and records the dynamic library calls which are called by diff --git a/config/debug/strace.in b/config/debug/strace.in index 38dd96f..e77702b 100644 --- a/config/debug/strace.in +++ b/config/debug/strace.in @@ -1,5 +1,7 @@ # strace +## depends on ! LIBC_bionic + choice bool prompt "strace version" diff --git a/patches/gdb/7.12.1/200-allow-android.patch b/patches/gdb/7.12.1/200-allow-android.patch new file mode 100644 index 0000000..7954477 --- /dev/null +++ b/patches/gdb/7.12.1/200-allow-android.patch @@ -0,0 +1,42 @@ +diff -urpN gdb-7.12.1.orig/gdb/gdbserver/configure gdb-7.12.1/gdb/gdbserver/configure +--- gdb-7.12.1.orig/gdb/gdbserver/configure 2017-05-14 17:02:46.742711695 -0700 ++++ gdb-7.12.1/gdb/gdbserver/configure 2017-05-14 17:03:22.147058607 -0700 +@@ -6671,17 +6671,6 @@ fi + + + case "${target}" in +- *-android*) +- # Starting with NDK version 9, actually includes definitions +- # of Elf32_auxv_t and Elf64_auxv_t. But sadly, includes +- # which defines some of the ELF types incorrectly, +- # leading to conflicts with the defintions from . +- # This makes it impossible for us to include both and +- # , which means that, in practice, we do not have +- # access to Elf32_auxv_t and Elf64_auxv_t on this platform. +- # Therefore, do not try to auto-detect availability, as it would +- # get it wrong on this platform. +- ;; + *) + ac_fn_c_check_type "$LINENO" "Elf32_auxv_t" "ac_cv_type_Elf32_auxv_t" "#include + +diff -urpN gdb-7.12.1.orig/gdb/gdbserver/configure.ac gdb-7.12.1/gdb/gdbserver/configure.ac +--- gdb-7.12.1.orig/gdb/gdbserver/configure.ac 2017-05-14 17:02:46.742711695 -0700 ++++ gdb-7.12.1/gdb/gdbserver/configure.ac 2017-05-14 17:03:53.219361720 -0700 +@@ -179,17 +179,6 @@ AC_CHECK_TYPES(socklen_t, [], [], + ]) + + case "${target}" in +- *-android*) +- # Starting with NDK version 9, actually includes definitions +- # of Elf32_auxv_t and Elf64_auxv_t. But sadly, includes +- # which defines some of the ELF types incorrectly, +- # leading to conflicts with the defintions from . +- # This makes it impossible for us to include both and +- # , which means that, in practice, we do not have +- # access to Elf32_auxv_t and Elf64_auxv_t on this platform. +- # Therefore, do not try to auto-detect availability, as it would +- # get it wrong on this platform. +- ;; + *) + AC_CHECK_TYPES([Elf32_auxv_t, Elf64_auxv_t], [], [], + #include diff --git a/samples/aarch64-unknown-linux-android/crosstool.config b/samples/aarch64-unknown-linux-android/crosstool.config index 339819c..295b219 100644 --- a/samples/aarch64-unknown-linux-android/crosstool.config +++ b/samples/aarch64-unknown-linux-android/crosstool.config @@ -1,3 +1,4 @@ +CT_EXPERIMENTAL=y CT_ARCH_arm=y CT_ARCH_64=y CT_ARCH_ARCH="armv8-a" @@ -6,4 +7,5 @@ CT_KERNEL_linux=y CT_LIBC_BIONIC_V_14b=y CT_ANDROID_API_21=y CT_CC_LANG_CXX=y +CT_DEBUG_gdb=y CT_GETTEXT=y -- cgit v0.10.2-6-g49f6
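
Finally, regarding the gdbserver change in 200-allow-android.patch above: the configure machinery it re-enables is plain AC_CHECK_TYPES, i.e. it tries to compile a small test against the target C library. A rough equivalent of that probe is shown below (not taken from gdb; it checks both types at once for brevity, whereas configure tests each separately, and it relies on the libc ELF header providing the types, which per the removed comment holds for bionic since NDK r9 and for glibc generally):

/* Rough equivalent of gdbserver's AC_CHECK_TYPES probe for the ELF auxv
   types: it succeeds if this file compiles against the target C library. */
#include <elf.h>

int main (void)
{
  Elf32_auxv_t av32;
  Elf64_auxv_t av64;
  (void) av32;
  (void) av64;
  return 0;
}

With the blanket *-android* exclusion gone, the result simply reflects what the selected API level's headers provide, which is what the commit message above relies on for the API 21 sample.
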