author    Alexey Neyman <stilor@att.net>    2017-05-14 19:24:37 (GMT)
committer Alexey Neyman <stilor@att.net>    2017-05-14 22:23:34 (GMT)
commit    131011493b7550b5ed4bdf241e8432aeae7694e8 (patch)
tree      e648006437fd070b04dd0223dc3a74e5bf83b3c9 /patches/gcc
parent    22e4a3f4334ee74233e40e19f3a72187eed3bb91 (diff)
Split the patch from CrystaX to separate commits
... and edit the biggest chunk as in the previous 950-bionic-android.patch. Also, drop arm_neon.h - it is a separate package and will be added separately.

Signed-off-by: Alexey Neyman <stilor@att.net>
Diffstat (limited to 'patches/gcc')
-rw-r--r--  patches/gcc/6.3.0/950-bionic-android.patch                17570
-rw-r--r--  patches/gcc/6.3.0/970-crystax.patch                         551
-rw-r--r--  patches/gcc/6.3.0/971-crystax.patch                          25
-rw-r--r--  patches/gcc/6.3.0/972-crystax.patch                         302
-rw-r--r--  patches/gcc/6.3.0/973-crystax.patch                          20
-rw-r--r--  patches/gcc/6.3.0/974-crystax.patch                          24
-rw-r--r--  patches/gcc/6.3.0/975-crystax.patch                          31
-rw-r--r--  patches/gcc/6.3.0/976-crystax.patch                          21
-rw-r--r--  patches/gcc/6.3.0/977-crystax.patch                          33
-rw-r--r--  patches/gcc/7.1.0/951-bionic-ndk.patch                       58
-rw-r--r--  patches/gcc/7.1.0/952-bionic-errno.patch                     19
-rw-r--r--  patches/gcc/7.1.0/970-crystax.patch                         553
-rw-r--r--  patches/gcc/7.1.0/971-crystax.patch                          25
-rw-r--r--  patches/gcc/7.1.0/972-crystax.patch                         302
-rw-r--r--  patches/gcc/7.1.0/973-crystax.patch                          20
-rw-r--r--  patches/gcc/7.1.0/974-crystax.patch                          24
-rw-r--r--  patches/gcc/7.1.0/975-crystax.patch                          31
-rw-r--r--  patches/gcc/7.1.0/976-crystax.patch                          21
-rw-r--r--  patches/gcc/7.1.0/977-crystax.patch                          33
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch   17570
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/970-crystax.patch            551
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/971-crystax.patch             25
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/972-crystax.patch            302
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/973-crystax.patch             20
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/974-crystax.patch             24
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/975-crystax.patch             31
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/976-crystax.patch             21
-rw-r--r--  patches/gcc/linaro-6.3-2017.02/977-crystax.patch             33
28 files changed, 3100 insertions, 35140 deletions
diff --git a/patches/gcc/6.3.0/950-bionic-android.patch b/patches/gcc/6.3.0/950-bionic-android.patch
deleted file mode 100644
index 1ae1186..0000000
--- a/patches/gcc/6.3.0/950-bionic-android.patch
+++ /dev/null
@@ -1,17570 +0,0 @@
-Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master
-
-Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with
-Crystax-specific lib support omitted
-diff --git a/gcc/config.gcc b/gcc/config.gcc
-index f66e48c..4380657 100644
---- a/gcc/config.gcc
-+++ b/gcc/config.gcc
-@@ -374,6 +374,7 @@ i[34567]86-*-*)
- rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h
- adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h
- avx512cdintrin.h avx512erintrin.h avx512pfintrin.h
-+ arm_neon.h
- shaintrin.h clflushoptintrin.h xsavecintrin.h
- xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
- avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
-@@ -396,6 +397,7 @@ x86_64-*-*)
- rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h
- adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h
- avx512cdintrin.h avx512erintrin.h avx512pfintrin.h
-+ arm_neon.h
- shaintrin.h clflushoptintrin.h xsavecintrin.h
- xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
- avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
-@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*)
- TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
- ;;
- aarch64*-*-linux*)
-- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
-+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h"
- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
-+ extra_options="${extra_options} linux-android.opt"
- tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
- case $target in
- aarch64_be-*)
- tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
- ;;
-+ aarch64*-*-linux-android*)
-+ tm_file="${tm_file} aarch64/aarch64-linux-android.h"
-+ ;;
- esac
- aarch64_multilibs="${with_multilib_list}"
- if test "$aarch64_multilibs" = "default"; then
-@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian.
- tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
- extra_options="${extra_options} linux-android.opt"
- case ${target} in
-+ mips64*android*)
-+ default_mips_arch=mips64r6
-+ default_mips_abi=64
-+ tm_file="${tm_file} mips/android.h"
-+ tmake_file="${tmake_file} mips/t-linux-android64"
-+ ;;
-+ mips*android*)
-+ default_mips_arch=mips32
-+ tm_file="${tm_file} mips/android.h"
-+ tmake_file="$tmake_file mips/t-linux-android"
-+ ;;
- mipsisa32r6*)
- default_mips_arch=mips32r6
- ;;
-diff --git a/gcc/config.in b/gcc/config.in
-index 115cb61..9339168 100644
---- a/gcc/config.in
-+++ b/gcc/config.in
-@@ -2119,6 +2119,12 @@
- #endif
-
-
-+/* Define if your system supports PT_GNU_EH_FRAME for static executable. */
-+#ifndef USED_FOR_TARGET
-+#undef USE_EH_FRAME_HDR_FOR_STATIC
-+#endif
-+
-+
- /* Define to 1 if the 'long long' type is wider than 'long' but still
- efficiently supported by the host hardware. */
- #ifndef USED_FOR_TARGET
-diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
-new file mode 100644
-index 0000000..cffd84d
---- /dev/null
-+++ b/gcc/config/aarch64/aarch64-linux-android.h
-@@ -0,0 +1,63 @@
-+/* Machine description for AArch64 architecture.
-+ Copyright (C) 2014 Free Software Foundation, Inc.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 3, or (at your option)
-+ any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef GCC_AARCH64_LINUX_ANDROID_H
-+#define GCC_AARCH64_LINUX_ANDROID_H
-+
-+
-+#undef TARGET_OS_CPP_BUILTINS
-+#define TARGET_OS_CPP_BUILTINS() \
-+ do \
-+ { \
-+ GNU_USER_TARGET_OS_CPP_BUILTINS(); \
-+ ANDROID_TARGET_OS_CPP_BUILTINS(); \
-+ } \
-+ while (0)
-+
-+#undef LINK_SPEC
-+#define LINK_SPEC \
-+ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \
-+ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
-+
-+#undef CC1_SPEC
-+#define CC1_SPEC \
-+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
-+
-+#define CC1PLUS_SPEC \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-+
-+#undef LIB_SPEC
-+#define LIB_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
-+ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
-+
-+#undef STARTFILE_SPEC
-+#define STARTFILE_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC)
-+
-+#undef ENDFILE_SPEC
-+#define ENDFILE_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
-+
-+#ifdef IN_LIBGCC2
-+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
-+#endif
-+
-+#endif /* GCC_AARCH64_LINUX_ANDROID_H */
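Note: the new header above chains GNU_USER_TARGET_OS_CPP_BUILTINS() with ANDROID_TARGET_OS_CPP_BUILTINS(), which is what lets user code distinguish a Bionic target at preprocessing time. A minimal sketch of how that surfaces in ordinary source, assuming the usual __ANDROID__ predefine comes from gcc/config/linux-android.h (not shown in this patch); libc_flavor() is a hypothetical helper:

/* Illustrative only: detecting Bionic in code built by an
   aarch64-linux-android toolchain configured with this patch. */
#include <stdio.h>

static const char *libc_flavor(void)
{
#if defined(__ANDROID__)   /* assumed to come from ANDROID_TARGET_OS_CPP_BUILTINS() */
    return "bionic";
#elif defined(__linux__)   /* from GNU_USER_TARGET_OS_CPP_BUILTINS() */
    return "glibc/musl";
#else
    return "other";
#endif
}

int main(void)
{
    printf("libc: %s\n", libc_flavor());
    return 0;
}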
-diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
-index 5fcaa59..6864195 100644
---- a/gcc/config/aarch64/aarch64-linux.h
-+++ b/gcc/config/aarch64/aarch64-linux.h
-@@ -21,7 +21,14 @@
- #ifndef GCC_AARCH64_LINUX_H
- #define GCC_AARCH64_LINUX_H
-
--#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
-+#ifndef RUNTIME_ROOT_PREFIX
-+#define RUNTIME_ROOT_PREFIX ""
-+#endif
-+#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
-+#ifdef BIONIC_DYNAMIC_LINKER
-+#undef BIONIC_DYNAMIC_LINKER
-+#endif
-+#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
-
- #undef MUSL_DYNAMIC_LINKER
- #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
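Note: the RUNTIME_ROOT_PREFIX trick in the hunk above works through plain C string-literal concatenation, so with the default empty prefix the spec strings reduce to the standard linker paths. A standalone sketch of the expansion (macro names as in the hunk; how the CrystaX build predefines a non-empty prefix is outside this patch):

/* With the default empty prefix ... */
#define RUNTIME_ROOT_PREFIX ""
#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"

/* ... adjacent literals concatenate, so BIONIC_DYNAMIC_LINKER expands to
   "" "/system/bin/linker64", i.e. the single string "/system/bin/linker64".
   A build that predefines RUNTIME_ROOT_PREFIX as "/some/root" would get
   "/some/root/system/bin/linker64" instead. */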
-diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h
-index 093c38b..54b3e0c 100644
---- a/gcc/config/alpha/elf.h
-+++ b/gcc/config/alpha/elf.h
-@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number;
- I imagine that other systems will catch up. In the meantime, it
- doesn't harm to make sure that the data exists to be used later. */
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
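Note: the same LINK_EH_SPEC change recurs below for freebsd.h and gnu-user.h. Per the config.in comment earlier, USE_EH_FRAME_HDR_FOR_STATIC makes the driver request --eh-frame-hdr even for -static links; presumably the point is that statically linked (e.g. Bionic) executables still need a PT_GNU_EH_FRAME segment for the libgcc unwinder to locate unwind tables at run time. A minimal program exercising that path (count_frame is a hypothetical callback; _Unwind_Backtrace is the standard libgcc interface from <unwind.h>):

/* Illustrative only: a statically linked binary walking its own stack. */
#include <stdio.h>
#include <unwind.h>

static _Unwind_Reason_Code count_frame(struct _Unwind_Context *ctx, void *arg)
{
    (void)ctx;
    ++*(int *)arg;                 /* just count the frames we unwind through */
    return _URC_NO_REASON;
}

int main(void)
{
    int frames = 0;
    _Unwind_Backtrace(count_frame, &frames);   /* needs usable unwind tables */
    printf("unwound %d frames\n", frames);
    return 0;
}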
-diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
-index 5974c65..71b2c7a 100644
---- a/gcc/config/arm/arm.c
-+++ b/gcc/config/arm/arm.c
-@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code)
- memsize = MEM_SIZE (x);
-
- /* Only certain alignment specifiers are supported by the hardware. */
-- if (memsize == 32 && (align % 32) == 0)
-+ /* Note that ARM EABI only guarantees 8-byte stack alignment. While GCC
-+ honors stricter alignment of composite types in user code, it doesn't
-+ observe the alignment of memory passed as an extra argument for a function
-+ returning a large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */
-+ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED)
- align_bits = 256;
-- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
-+ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED)
- align_bits = 128;
- else if (memsize >= 8 && (align % 8) == 0)
- align_bits = 64;
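Note: the comment added above refers to GCC PR 57271: under AAPCS, the hidden temporary a caller allocates for a large composite return value is only guaranteed 8-byte alignment, so emitting NEON load/store alignment hints of 128 or 256 bits for it can trap. A minimal sketch of the problematic shape (struct big, make_big and consume are hypothetical, not taken from the PR):

/* Hypothetical illustration of the PR 57271 situation. */
struct big {
    unsigned char bytes[64];
} __attribute__((aligned(32)));   /* stricter than the 8-byte AAPCS stack minimum */

struct big make_big(void)         /* large result: returned via a caller-allocated temporary */
{
    struct big r = {{0}};
    return r;
}

void consume(void)
{
    struct big b = make_big();    /* the hidden return slot may be only 8-byte aligned,
                                     so a vst1 with a :256 alignment hint could fault */
    (void)b;
}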
-diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
-index ad123dd..97b059d 100644
---- a/gcc/config/arm/arm.h
-+++ b/gcc/config/arm/arm.h
-@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes
-
- #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \
- || (TARGET_THUMB1 \
-+ && !inline_thumb1_jump_table \
- && (optimize_size || flag_pic)))
-
- #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \
-- (TARGET_THUMB1 \
-+ (TARGET_THUMB1 && !inline_thumb1_jump_table \
- ? (min >= 0 && max < 512 \
- ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \
- : min >= -256 && max < 256 \
-diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
-index 47171b9..eb22d11 100644
---- a/gcc/config/arm/arm.md
-+++ b/gcc/config/arm/arm.md
-@@ -8179,7 +8179,7 @@
- (match_operand:SI 2 "const_int_operand" "") ; total range
- (match_operand:SI 3 "" "") ; table label
- (match_operand:SI 4 "" "")] ; Out of range label
-- "TARGET_32BIT || optimize_size || flag_pic"
-+ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)"
- "
- {
- enum insn_code code;
-diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
-index 0ebe017..7724538 100644
---- a/gcc/config/arm/arm.opt
-+++ b/gcc/config/arm/arm.opt
-@@ -193,6 +193,10 @@ mthumb-interwork
- Target Report Mask(INTERWORK)
- Support calls between Thumb and ARM instruction sets.
-
-+minline-thumb1-jumptable
-+Target Report Var(inline_thumb1_jump_table)
-+Inline Thumb1 Jump table code
-+
- mtls-dialect=
- Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU)
- Specify thread local storage scheme.
-diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h
-index 77f3055..32158ed 100644
---- a/gcc/config/arm/elf.h
-+++ b/gcc/config/arm/elf.h
-@@ -56,8 +56,7 @@
- #undef SUBSUBTARGET_EXTRA_SPECS
- #define SUBSUBTARGET_EXTRA_SPECS
-
--#ifndef ASM_SPEC
--#define ASM_SPEC "\
-+#define DEFAULT_ASM_SPEC "\
- %{mbig-endian:-EB} \
- %{mlittle-endian:-EL} \
- %(asm_cpu_spec) \
-@@ -66,6 +65,9 @@
- %{mthumb-interwork:-mthumb-interwork} \
- %{mfloat-abi=*} %{mfpu=*} \
- %(subtarget_extra_asm_spec)"
-+
-+#ifndef ASM_SPEC
-+#define ASM_SPEC DEFAULT_ASM_SPEC
- #endif
-
- /* The ARM uses @ as a comment character so we need to redefine
-@@ -104,7 +106,8 @@
- the code more efficient, but for Thumb-1 it's better to put them out of
- band unless we are generating compressed tables. */
- #define JUMP_TABLES_IN_TEXT_SECTION \
-- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic)))
-+ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \
-+ && (optimize_size || flag_pic)))
-
- #ifndef LINK_SPEC
- #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X"
-diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
-index ace8481..ed7f4d0 100644
---- a/gcc/config/arm/linux-eabi.h
-+++ b/gcc/config/arm/linux-eabi.h
-@@ -108,11 +108,16 @@
- #define CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \
- GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \
-- ANDROID_CC1_SPEC)
-+ ANDROID_CC1_SPEC("-fpic"))
-
- #define CC1PLUS_SPEC \
- LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-
-+#undef ASM_SPEC
-+#define ASM_SPEC \
-+ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \
-+ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC)
-+
- #undef LIB_SPEC
- #define LIB_SPEC \
- LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
-diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi
-index 8f1307c..cbbec5b 100644
---- a/gcc/config/arm/t-linux-androideabi
-+++ b/gcc/config/arm/t-linux-androideabi
-@@ -1,8 +1,9 @@
--MULTILIB_OPTIONS = march=armv7-a mthumb
--MULTILIB_DIRNAMES = armv7-a thumb
--MULTILIB_EXCEPTIONS =
-+MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard
-+MULTILIB_DIRNAMES = armv7-a thumb hard
-+MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard*
- MULTILIB_MATCHES =
- MULTILIB_OSDIRNAMES =
-+MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch
-
- # The "special" multilib can be used to build native applications for Android,
- # as opposed to native shared libraries that are then called via JNI.
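Note on the elf.h/linux-eabi.h hunks above: splitting the assembler spec into DEFAULT_ASM_SPEC is what lets linux-eabi.h keep the generic definition for GNU/Linux while appending ANDROID_ASM_SPEC for Bionic targets. The underlying preprocessor idiom, reduced to a neutral sketch (WIDGET_SPEC and its values are placeholders, not GCC macros):

/* Generic header: name the default separately so it stays reusable. */
#define DEFAULT_WIDGET_SPEC "base options"

#ifndef WIDGET_SPEC
# define WIDGET_SPEC DEFAULT_WIDGET_SPEC                      /* plain ELF default */
#endif

/* A more specific header can then replace the macro yet reuse the default text. */
#undef  WIDGET_SPEC
#define WIDGET_SPEC DEFAULT_WIDGET_SPEC " extra android options"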
-diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h
-index 5ded869..5f51ac8 100644
---- a/gcc/config/freebsd.h
-+++ b/gcc/config/freebsd.h
-@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see
- #define LIB_SPEC FBSD_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #ifdef TARGET_LIBC_PROVIDES_SSP
- #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \
-diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h
-index b0bf40a..d1874bc 100644
---- a/gcc/config/gnu-user.h
-+++ b/gcc/config/gnu-user.h
-@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #undef LINK_GCC_C_SEQUENCE_SPEC
- #define LINK_GCC_C_SEQUENCE_SPEC \
-diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h
-new file mode 100644
-index 0000000..10944f3
---- /dev/null
-+++ b/gcc/config/i386/arm_neon.h
-@@ -0,0 +1,16622 @@
-+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com
-+
-+//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved.
-+
-+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-+
-+//By downloading, copying, installing or using the software you agree to this license.
-+//If you do not agree to this license, do not download, install, copy or use the software.
-+
-+// License Agreement
-+
-+//Permission to use, copy, modify, and/or distribute this software for any
-+//purpose with or without fee is hereby granted, provided that the above
-+//copyright notice and this permission notice appear in all copies.
-+
-+//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-+//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
-+//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-+//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
-+//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
-+//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-+//PERFORMANCE OF THIS SOFTWARE.
-+
-+//*****************************************************************************************
-+// This file is intended to simplify ARM->IA32 porting
-+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
-+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
-+// The MMX instruction set is not used due to its performance overhead and the need to issue the
-+// EMMS instruction (_mm_empty()) for mmx-x87 floating point switching
-+//*****************************************************************************************
-+
-+//!!!!!!! To use this file in a project that uses ARM NEON intrinsics, just keep arm_neon.h included and compile as usual.
-+//!!!!!!! Please pay attention to USE_SSE4 below - you need to define it on the newest Intel platforms for
-+//!!!!!!! greater performance. It can be done with the -msse4.2 compiler switch.
-+
-+#ifndef NEON2SSE_H
-+#define NEON2SSE_H
-+
-+#ifndef USE_SSE4
-+#if defined(__SSE4_2__)
-+ #define USE_SSE4
-+#endif
-+#endif
-+
-+#include <xmmintrin.h> //SSE
-+#include <emmintrin.h> //SSE2
-+#include <pmmintrin.h> //SSE3
-+#include <tmmintrin.h> //SSSE3
-+#ifdef USE_SSE4
-+#include <smmintrin.h> //SSE4.1
-+#include <nmmintrin.h> //SSE4.2
-+#endif
-+
-+
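Note: as the banner above says, the header is meant to let NEON source build unmodified for IA-32/Intel 64 by mapping each intrinsic onto SSE. A minimal usage sketch (scale4 is hypothetical; vmulq_f32 and float32x4_t appear in the declarations further down; building with -msse4.2 so USE_SSE4 is picked up is assumed):

/* Illustrative only: ordinary ARM NEON code, compiled for x86 through this shim. */
#include <arm_neon.h>

float32x4_t scale4(float32x4_t v, float32x4_t k)
{
    return vmulq_f32(v, k);   /* VMUL.F32 q0,q0,q0 on ARM; an SSE multiply here */
}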
-+//*************** functions and data attributes, compiler dependent *********************************
-+//***********************************************************************************
-+#ifdef __GNUC__
-+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-+#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
-+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-+#if _GCC_VERSION < 40500
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
-+#else
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
-+#endif
-+#if defined(__x86_64__)
-+ #define _NEON2SSE_64BIT __x86_64__
-+#endif
-+#else
-+#define _NEON2SSE_ALIGN_16 __declspec(align(16))
-+#define _NEON2SSE_INLINE __inline
-+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
-+#if defined(_M_X64)
-+ #define _NEON2SSE_64BIT _M_X64
-+#endif
-+#else
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
-+#endif
-+#endif
-+
-+#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
-+ #define _NEON2SSE_64BIT_SSE4
-+#endif
-+
-+/*********************************************************************************************************************/
-+// data types conversion
-+/*********************************************************************************************************************/
-+#if defined(_MSC_VER) && (_MSC_VER < 1300)
-+ typedef signed char int8_t;
-+ typedef unsigned char uint8_t;
-+ typedef signed short int16_t;
-+ typedef unsigned short uint16_t;
-+ typedef signed int int32_t;
-+ typedef unsigned int uint32_t;
-+ typedef signed long long int64_t;
-+ typedef unsigned long long uint64_t;
-+#elif defined(_MSC_VER)
-+ typedef signed __int8 int8_t;
-+ typedef unsigned __int8 uint8_t;
-+ typedef signed __int16 int16_t;
-+ typedef unsigned __int16 uint16_t;
-+ typedef signed __int32 int32_t;
-+ typedef unsigned __int32 uint32_t;
-+
-+ typedef signed long long int64_t;
-+ typedef unsigned long long uint64_t;
-+#else
-+#include <stdint.h>
-+#include <limits.h>
-+#endif
-+
-+typedef union __m64_128 {
-+ uint64_t m64_u64[1];
-+ float m64_f32[2];
-+ int8_t m64_i8[8];
-+ int16_t m64_i16[4];
-+ int32_t m64_i32[2];
-+ int64_t m64_i64[1];
-+ uint8_t m64_u8[8];
-+ uint16_t m64_u16[4];
-+ uint32_t m64_u32[2];
-+} __m64_128;
-+
-+typedef __m64_128 int8x8_t;
-+typedef __m64_128 uint8x8_t;
-+typedef __m64_128 int16x4_t;
-+typedef __m64_128 uint16x4_t;
-+typedef __m64_128 int32x2_t;
-+typedef __m64_128 uint32x2_t;
-+typedef __m64_128 int64x1_t;
-+typedef __m64_128 uint64x1_t;
-+typedef __m64_128 poly8x8_t;
-+typedef __m64_128 poly16x4_t;
-+
-+typedef __m64_128 float32x2_t;
-+typedef __m128 float32x4_t;
-+
-+typedef __m128 float16x4_t; //not supported by IA, for compatibility
-+typedef __m128 float16x8_t; //not supported by IA, for compatibility
-+
-+typedef __m128i int8x16_t;
-+typedef __m128i int16x8_t;
-+typedef __m128i int32x4_t;
-+typedef __m128i int64x2_t;
-+typedef __m128i uint8x16_t;
-+typedef __m128i uint16x8_t;
-+typedef __m128i uint32x4_t;
-+typedef __m128i uint64x2_t;
-+typedef __m128i poly8x16_t;
-+typedef __m128i poly16x8_t;
-+
-+#if defined(_MSC_VER)
-+ #define SINT_MIN (-2147483647 - 1) /* min signed int value */
-+ #define SINT_MAX 2147483647 /* max signed int value */
-+#else
-+ #define SINT_MIN INT_MIN /* min signed int value */
-+ #define SINT_MAX INT_MAX /* max signed int value */
-+#endif
-+
-+typedef float float32_t;
-+#if !defined(__clang__)
-+typedef float __fp16;
-+#endif
-+
-+typedef uint8_t poly8_t;
-+typedef uint16_t poly16_t;
-+
-+
-+//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
-+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for functions that use these types
-+struct int8x16x2_t {
-+ int8x16_t val[2];
-+};
-+struct int16x8x2_t {
-+ int16x8_t val[2];
-+};
-+struct int32x4x2_t {
-+ int32x4_t val[2];
-+};
-+struct int64x2x2_t {
-+ int64x2_t val[2];
-+};
-+//Unfortunately we are unable to merge two 64-bit values into one 128-bit register because the user must be able to access the val[n] members explicitly!!!
-+struct int8x8x2_t {
-+ int8x8_t val[2];
-+};
-+struct int16x4x2_t {
-+ int16x4_t val[2];
-+};
-+struct int32x2x2_t {
-+ int32x2_t val[2];
-+};
-+struct int64x1x2_t {
-+ int64x1_t val[2];
-+};
-+
-+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
-+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
-+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
-+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
-+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
-+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
-+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
-+typedef struct int8x16x2_t uint8x16x2_t;
-+typedef struct int16x8x2_t uint16x8x2_t;
-+typedef struct int32x4x2_t uint32x4x2_t;
-+typedef struct int64x2x2_t uint64x2x2_t;
-+typedef struct int8x16x2_t poly8x16x2_t;
-+typedef struct int16x8x2_t poly16x8x2_t;
-+
-+typedef struct int8x8x2_t uint8x8x2_t;
-+typedef struct int16x4x2_t uint16x4x2_t;
-+typedef struct int32x2x2_t uint32x2x2_t;
-+typedef struct int64x1x2_t uint64x1x2_t;
-+typedef struct int8x8x2_t poly8x8x2_t;
-+typedef struct int16x4x2_t poly16x4x2_t;
-+
-+//float
-+struct float32x4x2_t {
-+ float32x4_t val[2];
-+};
-+struct float16x8x2_t {
-+ float16x8_t val[2];
-+};
-+struct float32x2x2_t {
-+ float32x2_t val[2];
-+};
-+
-+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
-+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
-+typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
-+typedef float16x8x2_t float16x4x2_t;
-+
-+//4
-+struct int8x16x4_t {
-+ int8x16_t val[4];
-+};
-+struct int16x8x4_t {
-+ int16x8_t val[4];
-+};
-+struct int32x4x4_t {
-+ int32x4_t val[4];
-+};
-+struct int64x2x4_t {
-+ int64x2_t val[4];
-+};
-+
-+struct int8x8x4_t {
-+ int8x8_t val[4];
-+};
-+struct int16x4x4_t {
-+ int16x4_t val[4];
-+};
-+struct int32x2x4_t {
-+ int32x2_t val[4];
-+};
-+struct int64x1x4_t {
-+ int64x1_t val[4];
-+};
-+
-+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
-+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
-+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
-+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
-+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
-+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
-+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-+typedef struct int8x8x4_t uint8x8x4_t;
-+typedef struct int16x4x4_t uint16x4x4_t;
-+typedef struct int32x2x4_t uint32x2x4_t;
-+typedef struct int64x1x4_t uint64x1x4_t;
-+typedef struct int8x8x4_t poly8x8x4_t;
-+typedef struct int16x4x4_t poly16x4x4_t;
-+
-+typedef struct int8x16x4_t uint8x16x4_t;
-+typedef struct int16x8x4_t uint16x8x4_t;
-+typedef struct int32x4x4_t uint32x4x4_t;
-+typedef struct int64x2x4_t uint64x2x4_t;
-+typedef struct int8x16x4_t poly8x16x4_t;
-+typedef struct int16x8x4_t poly16x8x4_t;
-+
-+struct float32x4x4_t {
-+ float32x4_t val[4];
-+};
-+struct float16x8x4_t {
-+ float16x8_t val[4];
-+};
-+struct float32x2x4_t {
-+ float32x2_t val[4];
-+};
-+
-+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
-+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
-+typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
-+typedef float16x8x4_t float16x4x4_t;
-+
-+//3
-+struct int16x8x3_t {
-+ int16x8_t val[3];
-+};
-+struct int32x4x3_t {
-+ int32x4_t val[3];
-+};
-+struct int64x2x3_t {
-+ int64x2_t val[3];
-+};
-+struct int8x16x3_t {
-+ int8x16_t val[3];
-+};
-+
-+struct int16x4x3_t {
-+ int16x4_t val[3];
-+};
-+struct int32x2x3_t {
-+ int32x2_t val[3];
-+};
-+struct int64x1x3_t {
-+ int64x1_t val[3];
-+};
-+struct int8x8x3_t {
-+ int8x8_t val[3];
-+};
-+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
-+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
-+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
-+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
-+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
-+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
-+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
-+
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-+typedef struct int8x16x3_t uint8x16x3_t;
-+typedef struct int16x8x3_t uint16x8x3_t;
-+typedef struct int32x4x3_t uint32x4x3_t;
-+typedef struct int64x2x3_t uint64x2x3_t;
-+typedef struct int8x16x3_t poly8x16x3_t;
-+typedef struct int16x8x3_t poly16x8x3_t;
-+typedef struct int8x8x3_t uint8x8x3_t;
-+typedef struct int16x4x3_t uint16x4x3_t;
-+typedef struct int32x2x3_t uint32x2x3_t;
-+typedef struct int64x1x3_t uint64x1x3_t;
-+typedef struct int8x8x3_t poly8x8x3_t;
-+typedef struct int16x4x3_t poly16x4x3_t;
-+
-+//float
-+struct float32x4x3_t {
-+ float32x4_t val[3];
-+};
-+struct float32x2x3_t {
-+ float32x2_t val[3];
-+};
-+struct float16x8x3_t {
-+ float16x8_t val[3];
-+};
-+
-+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
-+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
-+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
-+typedef float16x8x3_t float16x4x3_t;
-+
-+
-+//****************************************************************************
-+//****** Porting auxiliary macros ********************************************
-+
-+//** floating point related macros **
-+#define _M128i(a) _mm_castps_si128(a)
-+#define _M128(a) _mm_castsi128_ps(a)
-+//here the most efficient implementation depends on the compiler and on whether the build is 32-bit or 64-bit
-+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
-+
-+ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
-+ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
-+ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
-+#else
-+ //for 32bit gcc and Microsoft compilers builds
-+ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
-+ #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
-+ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
-+#endif
-+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
-+
-+#define return64(a) _M64(res64,a); return res64;
-+#define return64f(a) _M64f(res64,a); return res64;
-+
-+#define _Ui64(a) (*(uint64_t*)&(a))
-+#define _UNSIGNED_T(a) u ## a
-+
-+#define _SIGNBIT64 ((uint64_t)1 << 63)
-+#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
-+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
-+
-+#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
-+#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
-+
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+#define __constrange(min,max) const
-+#define __transfersize(size)
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+
-+
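Note: the _pM128i/_M64/return64 helpers above are how the header emulates 64-bit "d register" values inside 128-bit SSE registers: widen the __m64_128 operand into an __m128i, compute there, then store the low 64 bits back through the union. A hypothetical helper in that style (sketch_vneg_s8 is not one of the functions declared below; _mm_setzero_si128 and _mm_sub_epi8 are standard SSE2 intrinsics):

/* Sketch only: negate each lane of an int8x8_t using the helpers above. */
int8x8_t sketch_vneg_s8(int8x8_t a)
{
    int8x8_t res64;                                            /* name required by return64() */
    __m128i res = _mm_sub_epi8(_mm_setzero_si128(), _pM128i(a));
    return64(res);                                             /* low 64 bits -> res64, returned */
}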
-+//*************************************************************************
-+//*************************************************************************
-+//********* Functions declarations as declared in original arm_neon.h *****
-+//*************************************************************************
-+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
-+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
-+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
-+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-+//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
-+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
-+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
-+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
-+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
-+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
-+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
-+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
-+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
-+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
-+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
-+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
-+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
-+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
-+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
-+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
-+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
-+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-+//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
-+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
-+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
-+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
-+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
-+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
-+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-+//Vector rounding add high half: vraddhn
-+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-+//Multiplication
-+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
-+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
-+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-+//multiply lane
-+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
-+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
-+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
-+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
-+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
-+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
-+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
-+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
-+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
-+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
-+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
-+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
-+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
-+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-+//Vector multiply subtract long
-+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
-+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-+//Vector saturating doubling multiply high
-+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
-+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-+//Vector saturating rounding doubling multiply high
-+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-+//Vector saturating doubling multiply accumulate long
-+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-+//Vector saturating doubling multiply subtract long
-+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-+//Vector long multiply
-+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
-+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-+//Vector saturating doubling long multiply
-+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-+//Subtraction
-+//Vector subtract
-+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
-+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
-+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-+//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
-+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
-+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-+//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
-+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
-+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-+//Vector saturating subtract
-+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
-+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
-+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
-+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
-+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
-+//Vector halving subtract
-+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
-+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
-+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
-+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
-+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
-+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
-+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-+//Vector subtract high half
-+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-+//Vector rounding subtract high half
-+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+//Comparison
-+//Vector compare equal
-+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-+//Vector compare greater-than or equal
-+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+//Vector compare less-than or equal
-+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+//Vector compare greater-than
-+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+//Vector compare less-than
-+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+//Vector compare absolute greater-than or equal
-+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+//Vector compare absolute less-than or equal
-+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+//Vector compare absolute greater-than
-+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+//Vector compare absolute less-than
-+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+//Vector test bits
-+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
-+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
-+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
-+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
-+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
-+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
-+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
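// A minimal usage sketch, assuming a NEON-capable ARM target and the standard <arm_neon.h>:
// the compare/test intrinsics above return per-lane masks (all ones for true, all zeros for
// false), which are normally consumed by bitwise ops. vandq_u32 is a standard NEON intrinsic
// not shown in this excerpt.
#include <arm_neon.h>

/* Keep only the lanes of val that are strictly greater than the matching lane of limit. */
static inline uint32x4_t keep_greater_u32(uint32x4_t val, uint32x4_t limit)
{
    uint32x4_t mask = vcgtq_u32(val, limit);   /* 0xFFFFFFFF where val > limit, else 0 */
    return vandq_u32(val, mask);               /* zero out the non-matching lanes */
}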
-+//Absolute difference
-+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
-+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
-+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
-+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
-+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
-+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
-+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
-+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
-+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-+//Absolute difference - long
-+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
-+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
-+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
-+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
-+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-+//Absolute difference and accumulate - long
-+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
-+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
-+//Max/Min
-+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
-+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
-+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
-+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
-+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
-+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
-+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
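// A minimal sketch of the usual min/max composition, assuming a NEON target and the standard
// <arm_neon.h>; the helper name clamp_f32x4 is illustrative only.
#include <arm_neon.h>

/* Clamp every float lane of v into the range [lo, hi]. */
static inline float32x4_t clamp_f32x4(float32x4_t v, float32x4_t lo, float32x4_t hi)
{
    return vminq_f32(vmaxq_f32(v, lo), hi);
}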
-+//Pairwise addition
-+//Pairwise add
-+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-+//Long pairwise add
-+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
-+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
-+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-+//Long pairwise add and accumulate
-+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
-+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
-+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
-+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
-+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
-+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-+//Folding maximum vpmax -> takes maximum of adjacent pairs
-+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
-+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-+//Folding minimum vpmin -> takes minimum of adjacent pairs
-+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
-+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
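// A minimal sketch of a horizontal reduction built from the pairwise forms above, assuming a
// NEON target; vadd_u32, vget_low_u32, vget_high_u32 and vget_lane_u32 are standard NEON
// intrinsics not shown in this excerpt.
#include <arm_neon.h>

/* Sum all four lanes of a uint32x4_t. */
static inline uint32_t hsum_u32x4(uint32x4_t v)
{
    uint32x2_t folded = vadd_u32(vget_low_u32(v), vget_high_u32(v)); /* {a+c, b+d} */
    uint32x2_t total  = vpadd_u32(folded, folded);                   /* {a+b+c+d, a+b+c+d} */
    return vget_lane_u32(total, 0);
}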
-+//Reciprocal/Sqrt
-+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
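// A minimal sketch of the intended Newton-Raphson use of vrecps, assuming a NEON target;
// vrecpeq_f32 (the initial estimate) and vmulq_f32 are standard NEON intrinsics not shown
// in this excerpt.
#include <arm_neon.h>

/* Approximate 1.0f / x per lane, refining the hardware estimate twice. */
static inline float32x4_t approx_recip_f32x4(float32x4_t x)
{
    float32x4_t e = vrecpeq_f32(x);            /* coarse initial estimate */
    e = vmulq_f32(e, vrecpsq_f32(x, e));       /* e *= (2 - x*e) */
    e = vmulq_f32(e, vrecpsq_f32(x, e));       /* second refinement step */
    return e;
}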
-+//Shifts by signed variable
-+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
-+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
-+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
-+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-+//Vector saturating shift left: (negative values shift right)
-+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
-+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
-+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-+//Vector rounding shift left: (negative values shift right)
-+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
-+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
-+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-+//Vector saturating rounding shift left: (negative values shift right)
-+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
-+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
-+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
-+//Shifts by a constant
-+//Vector shift right by constant
-+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
-+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
-+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-+//Vector shift left by constant
-+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+//Vector rounding shift right by constant
-+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
-+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
-+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-+//Vector shift right by constant and accumulate
-+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
-+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
-+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-+//Vector rounding shift right by constant and accumulate
-+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
-+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
-+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-+//Vector saturating shift left by constant
-+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
-+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
-+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-+//Vector signed->unsigned saturating shift left by constant
-+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-+//Vector narrowing shift right by constant
-+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+//Vector signed->unsigned narrowing saturating shift right by constant
-+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-+//Vector signed->unsigned rounding narrowing saturating shift right by constant
-+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-+//Vector narrowing saturating shift right by constant
-+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
-+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-+//Vector rounding narrowing shift right by constant
-+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+//Vector rounding narrowing saturating shift right by constant
-+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
-+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
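// A minimal sketch, assuming a NEON target: the narrowing saturating shifts above are the
// usual last step of fixed-point kernels, packing a widened accumulator back to 8 bits with
// rounding and saturation in one instruction.
#include <arm_neon.h>

/* Divide eight 16-bit sums by 16 (rounding to nearest) and clamp to [0, 255]. */
static inline uint8x8_t pack_sums_to_u8(int16x8_t sums)
{
    return vqrshrun_n_s16(sums, 4);            /* VQRSHRUN.S16 d0,q0,#4 */
}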
-+//Vector widening shift left by constant
-+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
-+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
-+//Shifts with insert
-+//Vector shift right and insert
-+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+//Vector shift left and insert
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
-+//Load a single vector from memory
-+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-+//Load a single lane from memory
-+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
-+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
-+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
-+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+//Load all lanes of vector with same value from memory
-+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+//Store a single vector or lane. Stores all lanes or a single lane of a vector.
-+//Store a single vector into memory
-+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
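// A minimal streaming-kernel sketch using the contiguous load/store forms above, assuming a
// NEON target and n being a multiple of 16; the function name is illustrative only.
#include <arm_neon.h>
#include <stdint.h>

/* dst[i] = |a[i] - b[i]| for n bytes. */
static void absdiff_bytes(uint8_t *dst, const uint8_t *a, const uint8_t *b, int n)
{
    for (int i = 0; i < n; i += 16) {
        uint8x16_t va = vld1q_u8(a + i);
        uint8x16_t vb = vld1q_u8(b + i);
        vst1q_u8(dst + i, vabdq_u8(va, vb));   /* vabdq_u8 is listed further up */
    }
}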
-+//Store a lane of a vector into memory
-+//Loads of an N-element structure
-+//Load N-element structure from memory
-+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
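// A minimal sketch of on-the-fly deinterleaving with the structure loads above, assuming a
// NEON target and R,G,B,A byte order in memory; vst4_u8 is the standard re-interleaving
// counterpart (not shown in this excerpt).
#include <arm_neon.h>
#include <stdint.h>

/* Swap the R and B channels of eight packed RGBA pixels (32 bytes) in place. */
static inline void swap_rb_rgba8(uint8_t *px)
{
    uint8x8x4_t v = vld4_u8(px);               /* val[0]=R, val[1]=G, val[2]=B, val[3]=A */
    uint8x8_t tmp = v.val[0];
    v.val[0] = v.val[2];
    v.val[2] = tmp;
    vst4_u8(px, v);
}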
-+//Load all lanes of N-element structure with same value from memory
-+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+//Load a single lane of N-element structure from memory
-+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
-+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
-+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
-+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+//Store N-element structure to memory
-+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+//Store a single lane of N-element structure to memory
-+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
-+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
-+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
-+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
-+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
-+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+//Initialize a vector from a literal bit pattern.
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
-+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
-+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
-+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
-+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
-+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
-+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
-+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
-+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
-+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
-+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
-+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
-+//Set all lanes to same value
-+//Load all lanes of vector to the same literal value
-+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
-+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
-+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
-+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
-+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
-+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
-+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
-+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
-+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
-+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
-+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
-+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
-+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
-+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
-+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
-+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
-+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
-+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
-+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
-+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
-+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+//Load all lanes of the vector to the value of a lane of a vector
-+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
-+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
-+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
-+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
-+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
-+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
-+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
-+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
-+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
-+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
-+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
-+//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
-+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
-+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
-+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
-+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
-+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
-+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
-+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
-+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
-+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
-+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
-+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
-+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
-+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
-+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
-+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
-+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
-+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
-+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
-+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
-+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
-+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
-+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
-+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
-+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
-+//Converting vectors. These intrinsics are used to convert vectors.
-+//Convert from float
-+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
-+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
-+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
-+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
-+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
-+//Convert to float
-+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
-+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
-+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
-+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
-+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
-+//Convert between floats
-+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
-+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
-+//Vector narrow integer
-+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
-+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
-+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
-+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
-+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
-+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
-+//Vector long move
-+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
-+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
-+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
-+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
-+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
-+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
-+//Vector saturating narrow integer
-+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
-+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
-+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
-+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
-+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
-+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
-+//Vector saturating narrow integer signed->unsigned
-+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
-+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
-+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
-+//Table look up
-+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
-+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+//Extended table look up intrinsics
-+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
-+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+//Operations with a scalar value
-+//Vector multiply accumulate with scalar
-+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
-+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
-+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
-+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
-+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
-+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
-+//Vector widening multiply accumulate with scalar
-+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
-+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
-+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
-+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
-+//Vector widening saturating doubling multiply accumulate with scalar
-+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
-+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
-+//Vector multiply subtract with scalar
-+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
-+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
-+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
-+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
-+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
-+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
-+//Vector widening multiply subtract with scalar
-+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
-+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
-+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
-+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
-+//Vector widening saturating doubling multiply subtract with scalar
-+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
-+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
-+//Vector multiply by scalar
-+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
-+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
-+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
-+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
-+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
-+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-+//Vector long multiply with scalar
-+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
-+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
-+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
-+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
-+//Vector long multiply by scalar
-+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
-+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
-+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
-+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
-+//Vector saturating doubling long multiply with scalar
-+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
-+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
-+//Vector saturating doubling long multiply by scalar
-+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
-+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
-+//Vector saturating doubling multiply high with scalar
-+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
-+//Vector saturating doubling multiply high by scalar
-+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
-+//Vector saturating rounding doubling multiply high with scalar
-+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
-+//Vector rounding saturating doubling multiply high by scalar
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
-+//Vector multiply accumulate with scalar
-+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
-+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
-+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
-+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
-+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
-+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-+//Vector widening multiply accumulate with scalar
-+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
-+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
-+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
-+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
-+//Vector widening saturating doubling multiply accumulate with scalar
-+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
-+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
-+//Vector multiply subtract with scalar
-+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
-+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
-+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
-+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
-+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
-+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
-+//Vector widening multiply subtract with scalar
-+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
-+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
-+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
-+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
-+//Vector widening saturating doubling multiply subtract with scalar
-+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
-+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
-+//Vector extract
-+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
-+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
-+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
-+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
-+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
-+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
-+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
-+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
-+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
-+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
-+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
-+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
-+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
-+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
-+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
-+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
-+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
-+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
-+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
-+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
-+//Other single operand arithmetic
-+//Absolute: Vd[i] = |Va[i]|
-+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
-+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
-+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
-+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
-+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-+//Saturating absolute: Vd[i] = sat(|Va[i]|)
-+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
-+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
-+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
-+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-+//Negate: Vd[i] = - Va[i]
-+int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
-+int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
-+int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
-+float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
-+int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
-+int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
-+int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
-+float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
-+//Saturating Negate: sat(Vd[i] = - Va[i])
-+int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
-+int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
-+int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
-+int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
-+int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
-+int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
-+//Count leading sign bits
-+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
-+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
-+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
-+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
-+//Count leading zeros
-+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
-+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
-+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
-+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
-+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
-+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
-+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
-+//Count number of set bits
-+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
-+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
-+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
-+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
-+//Reciprocal estimate
-+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-+//Reciprocal square root estimate
-+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-+//Logical operations
-+//Bitwise not
-+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
-+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
-+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
-+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
-+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
-+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
-+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
-+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
-+//Bitwise and
-+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
-+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
-+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
-+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
-+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
-+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
-+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
-+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
-+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
-+//Bitwise or
-+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
-+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
-+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
-+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
-+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
-+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
-+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
-+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
-+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
-+//Bitwise exclusive or (EOR or XOR)
-+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
-+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
-+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
-+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
-+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
-+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
-+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
-+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
-+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
-+//Bit Clear
-+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
-+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
-+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
-+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
-+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
-+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
-+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
-+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
-+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-+//Bitwise OR complement
-+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
-+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
-+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
-+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
-+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
-+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
-+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
-+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
-+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
-+//Bitwise Select
-+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
-+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
-+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
-+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
-+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
-+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
-+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
-+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
-+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
-+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
-+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
-+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
-+//Transposition operations
-+//Transpose elements
-+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
-+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
-+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
-+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
-+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
-+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
-+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
-+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
-+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
-+//Interleave elements
-+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
-+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
-+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
-+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
-+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
-+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
-+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
-+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
-+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
-+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
-+//De-Interleave elements
-+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
-+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
-+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
-+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
-+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
-+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
-+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
-+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
-+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
-+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
-+
-+
-+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-+// the following macros solve the problem of the "immediate parameter requirement" for some x86 intrinsics. While it is not a must for release builds,
-+//for debug builds we need it so the code compiles instead of failing with the "Intrinsic parameter must be an immediate value" error
-+//
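-+//For example, in the debug path below a call such as _MM_EXTRACT_EPI16(vec, lane) with a run-time 'lane' expands to a
-+//switch whose every case passes a literal constant to _mm_extract_epi16, so the underlying intrinsic always receives an
-+//immediate argument regardless of how 'lane' was computed.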
-+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //even in a release build the workaround is still needed for VS2010 and earlier compilers.
-+
-+ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
-+
-+ #define _MM_EXTRACT_EPI16 _mm_extract_epi16
-+ #define _MM_INSERT_EPI16 _mm_insert_epi16
-+#ifdef USE_SSE4
-+ #define _MM_EXTRACT_EPI8 _mm_extract_epi8
-+ #define _MM_EXTRACT_EPI32 _mm_extract_epi32
-+ #define _MM_EXTRACT_PS _mm_extract_ps
-+
-+ #define _MM_INSERT_EPI8 _mm_insert_epi8
-+ #define _MM_INSERT_EPI32 _mm_insert_epi32
-+ #define _MM_INSERT_PS _mm_insert_ps
-+#ifdef _NEON2SSE_64BIT
-+ #define _MM_INSERT_EPI64 _mm_insert_epi64
-+ #define _MM_EXTRACT_EPI64 _mm_extract_epi64
-+#endif
-+#endif //SSE4
-+#else
-+ #define _NEON2SSE_COMMA ,
-+ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
-+ switch(LANE) \
-+ { \
-+ case 0: return NAME(a b, 0); \
-+ case 1: return NAME(a b, 1); \
-+ case 2: return NAME(a b, 2); \
-+ case 3: return NAME(a b, 3); \
-+ case 4: return NAME(a b, 4); \
-+ case 5: return NAME(a b, 5); \
-+ case 6: return NAME(a b, 6); \
-+ case 7: return NAME(a b, 7); \
-+ case 8: return NAME(a b, 8); \
-+ case 9: return NAME(a b, 9); \
-+ case 10: return NAME(a b, 10); \
-+ case 11: return NAME(a b, 11); \
-+ case 12: return NAME(a b, 12); \
-+ case 13: return NAME(a b, 13); \
-+ case 14: return NAME(a b, 14); \
-+ case 15: return NAME(a b, 15); \
-+ default: return NAME(a b, 0); \
-+ }
-+
-+ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
-+ switch(LANE) \
-+ { \
-+ case 0: return NAME(vec p,0); \
-+ case 1: return NAME(vec p,1); \
-+ case 2: return NAME(vec p,2); \
-+ case 3: return NAME(vec p,3); \
-+ case 4: return NAME(vec p,4); \
-+ case 5: return NAME(vec p,5); \
-+ case 6: return NAME(vec p,6); \
-+ case 7: return NAME(vec p,7); \
-+ default: return NAME(vec p,0); \
-+ }
-+
-+ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
-+ switch(LANE) \
-+ { \
-+ case case0: return NAME(vec p,case0); \
-+ case case1: return NAME(vec p,case1); \
-+ case case2: return NAME(vec p,case2); \
-+ case case3: return NAME(vec p,case3); \
-+ default: return NAME(vec p,case0); \
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
-+ }
-+
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
-+ }
-+
-+#ifdef _NEON2SSE_64BIT
-+ //the special case of functions available only for SSE4 and 64-bit build.
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
-+ {
-+ switch(LANE) {
-+ case 0:
-+ return _mm_insert_epi64(vec, p, 0);
-+ case 1:
-+ return _mm_insert_epi64(vec, p, 1);
-+ default:
-+ return _mm_insert_epi64(vec, p, 0);
-+ }
-+ }
-+
-+ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
-+ {
-+ if (LANE ==0) return _mm_extract_epi64(val, 0);
-+ else return _mm_extract_epi64(val, 1);
-+ }
-+#endif
-+
-+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+#endif //USE_SSE4
-+
-+#endif //#ifdef NDEBUG
-+
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
-+// or for some specific commonly used operations whose implementation is missing in SSE
-+#ifdef USE_SSE4
-+ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
-+ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
-+ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
-+
-+ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
-+ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
-+ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
-+
-+ #define _MM_MAX_EPI8 _mm_max_epi8
-+ #define _MM_MAX_EPI32 _mm_max_epi32
-+ #define _MM_MAX_EPU16 _mm_max_epu16
-+ #define _MM_MAX_EPU32 _mm_max_epu32
-+
-+ #define _MM_MIN_EPI8 _mm_min_epi8
-+ #define _MM_MIN_EPI32 _mm_min_epi32
-+ #define _MM_MIN_EPU16 _mm_min_epu16
-+ #define _MM_MIN_EPU32 _mm_min_epu32
-+
-+ #define _MM_BLENDV_EPI8 _mm_blendv_epi8
-+ #define _MM_PACKUS_EPI32 _mm_packus_epi32
-+ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
-+
-+ #define _MM_MULLO_EPI32 _mm_mullo_epi32
-+ #define _MM_MUL_EPI32 _mm_mul_epi32
-+
-+ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
-+#else //no SSE4 !!!!!!
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi8(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi16(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi32(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi8(zero, a);
-+ return _mm_unpacklo_epi8(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi16(zero, a);
-+ return _mm_unpacklo_epi16(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi32(zero, a);
-+ return _mm_unpacklo_epi32(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t tmp[4];
-+ _mm_store_si128((__m128i*)tmp, vec);
-+ return tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t tmp[16];
-+ _mm_store_si128((__m128i*)tmp, vec);
-+ return (int)tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t tmp[4];
-+ _mm_store_si128((__m128i*)tmp, _M128i(vec));
-+ return tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = (int8_t)p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+ }
-+
-+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-+ __m128 tmp, vec_masked, p_masked;
-+        mask[LANE >> 4] = 0x0; //here LANE is not the actual lane, need to deal with it
-+ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
-+ tmp = _mm_or_ps(vec_masked, p_masked);
-+ return tmp;
-+ }
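-+    //Note: LANE here is the same immediate that the SSE4 path above passes to _mm_insert_ps (0, 16, 32, 48), in which the
-+    //destination lane is encoded in bits [5:4]; LANE >> 4 therefore recovers the actual lane index for the mask.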
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi8 (a, b);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi32(a, b);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
-+ {
-+ __m128i c8000, b_s, a_s, cmp;
-+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-+ b_s = _mm_sub_epi16 (b, c8000);
-+ a_s = _mm_sub_epi16 (a, c8000);
-+ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
-+ {
-+ __m128i c80000000, b_s, a_s, cmp;
-+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-+ b_s = _mm_sub_epi32 (b, c80000000);
-+ a_s = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi8 (b, a);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi32(b, a);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
-+ {
-+ __m128i c8000, b_s, a_s, cmp;
-+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-+ b_s = _mm_sub_epi16 (b, c8000);
-+ a_s = _mm_sub_epi16 (a, c8000);
-+ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
-+ {
-+ __m128i c80000000, b_s, a_s, cmp;
-+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-+ b_s = _mm_sub_epi32 (b, c80000000);
-+ a_s = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+    _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 - please see below
-+ {
-+        //it assumes each mask byte is always either 0xff or 0 (as in all use cases below), while the original _mm_blendv_epi8 only looks at the MSB of each mask byte.
-+ __m128i a_masked, b_masked;
-+ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
-+ a_masked = _mm_andnot_si128 (mask,a);
-+ return _mm_or_si128(a_masked, b_masked);
-+ }
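-+    //Worked example, per byte: with a = 0x11, b = 0x22 and mask = 0xff the result is (0xff & 0x22) | (~0xff & 0x11) = 0x22,
-+    //while mask = 0x00 yields 0x11, i.e. b is taken wherever the mask byte is all ones and a wherever it is all zeros.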
-+
-+ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ __m128i a16, b16, res, reshi,cmp, zero;
-+ zero = _mm_setzero_si128();
-+ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
-+ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
-+ res = _mm_unpacklo_epi64(a16, b16); //result without saturation
-+ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
-+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
-+ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
-+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
-+        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
-+ }
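-+    //The shuffle above gathers the low 16 bits of every 32-bit lane into the lower 64 bits and the high 16 bits into the
-+    //upper 64 bits; the high halves are then used only to clamp negative lanes to 0 (the andnot) and to saturate lanes
-+    //larger than 0xffff to 0xffff (the final or with the all-ones compare result).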
-+
-+ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ __m128i a16, res, reshi,cmp, zero;
-+ zero = _mm_setzero_si128();
-+ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
-+ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
-+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
-+ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
-+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
-+        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
-+ }
-+
-+
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
-+ int64_t res64;
-+ int i;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ for (i = 0; i<4; i++) {
-+            res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid signed 32-bit overflow
-+ res[i] = (int)(res64 & 0xffffffff);
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
-+ sign = _mm_xor_si128 (a, b);
-+ sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
-+ zero = _mm_setzero_si128();
-+        a_neg = _mm_abs_epi32 (a); //absolute value of a
-+        b_neg = _mm_abs_epi32 (b); //absolute value of b
-+ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
-+ mul_us_neg = _mm_sub_epi64(zero, mul_us);
-+ mul_us_neg = _mm_and_si128(sign, mul_us_neg);
-+ mul_us = _mm_andnot_si128(sign, mul_us);
-+ return _mm_or_si128 (mul_us, mul_us_neg);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
-+ {
-+ __m128i res;
-+ res = _mm_cmpeq_epi32 (a, b);
-+ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
-+ }
-+#endif //SSE4
-+
-+//the special case of functions working only for 32 bits, no SSE4
-+_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
-+ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+}
-+
-+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
-+{
-+ _NEON2SSE_ALIGN_16 int64_t tmp[2];
-+ _mm_store_si128((__m128i*)tmp, val);
-+ return tmp[LANE];
-+}
-+
-+#ifndef _NEON2SSE_64BIT_SSE4
-+ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
-+ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
-+#endif
-+
-+int32x4_t vqd_s32(int32x4_t a); //Saturating doubling for signed ints
-+_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
-+{
-+ //Overflow happens only if a and sum have the opposite signs
-+ __m128i c7fffffff, res, res_sat, res_xor_a;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_slli_epi32 (a, 1); // res = a*2
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
-+ res_xor_a = _mm_xor_si128(res, a);
-+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
-+
-+
-+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+//*************************************************************************
-+//*************************************************************************
-+//***************** Functions redefinition/implementation starts here *****
-+//*************************************************************************
-+//*************************************************************************
-+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+
-+/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here, as in the following sample:
-+#ifdef ARM
-+#define vector_addq_s32 _mm_add_epi32
-+#else //if we have IA
-+#define vector_addq_s32 vadd_s32
-+#endif
-+
-+********************************************************************************************
-+Functions below are organised in the following way:
-+
-+Each NEON intrinsic follows one of these patterns:
-+1. a full x86 SSE equivalent exists - the x86 version simply follows the NEON declaration under the corresponding #define statement
-+2. the x86 implementation needs more than one x86 intrinsic - it is shaped as an inlined C function with a return statement
-+3. it is a reference to another NEON function returning the same result and implemented in x86 as above - it is shaped as a matching NEON function definition
-+4. for about 5% of the functions a serial implementation is provided, together with a compiler performance warning, because the
-+corresponding x86 SIMD operation is unavailable or inefficient. If such functions are on your application's critical path,
-+please consider removing them from your code.
-+*/
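-+//For example, below vaddq_s8 follows pattern 1 (a plain #define to _mm_add_epi8), vqaddq_s32 follows pattern 2 (an inlined
-+//multi-intrinsic implementation), vadd_u8 follows pattern 3 (#define vadd_u8 vadd_s8), and vqadd_s64 follows pattern 4
-+//(a serial implementation guarded by _NEON2SSE_PERFORMANCE_WARNING).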
-+
-+//***********************************************************************
-+//************************ Vector add *****************************
-+//***********************************************************************
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
-+ return res64;
-+}
-+
-+
-+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
-+{
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-+#define vadd_u8 vadd_s8
-+
-+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-+#define vadd_u16 vadd_s16
-+
-+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-+#define vadd_u32 vadd_s32
-+
-+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
-+{
-+ uint64x1_t res64;
-+ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
-+ return res64;
-+}
-+
-+
-+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-+#define vaddq_s8 _mm_add_epi8
-+
-+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-+#define vaddq_s16 _mm_add_epi16
-+
-+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-+#define vaddq_s32 _mm_add_epi32
-+
-+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-+#define vaddq_s64 _mm_add_epi64
-+
-+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-+#define vaddq_f32 _mm_add_ps
-+
-+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-+#define vaddq_u8 _mm_add_epi8
-+
-+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-+#define vaddq_u16 _mm_add_epi16
-+
-+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-+#define vaddq_u32 _mm_add_epi32
-+
-+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-+#define vaddq_u64 _mm_add_epi64
-+
-+//**************************** Vector long add *****************************:
-+//***********************************************************************
-+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi16 (a16, b16);
-+}
-+
-+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a32, b32);
-+}
-+
-+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
-+{
-+    //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 ( a64, b64);
-+}
-+
-+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi16 (a16, b16);
-+}
-+
-+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a32, b32);
-+}
-+
-+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
-+{
-+    //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a64, b64);
-+}
-+
-+//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
-+//*************** *********************************************************************
-+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-+_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi16 (a, b16);
-+}
-+
-+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-+_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi32 (a, b32);
-+}
-+
-+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-+_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a, b64);
-+}
-+
-+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi16 (a, b16);
-+}
-+
-+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
-+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a, b32);
-+}
-+
-+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a, b64);
-+}
-+
-+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
-+//*************************************************************************************************************************
-+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+    return64(vhaddq_s8(_pM128i(a), _pM128i(b))); //use the signed q-form here; the unsigned variant mishandles mixed-sign inputs
-+}
-+
-+
-+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = vshrq_n_s8(tmp2,1);
-+ return _mm_add_epi8(tmp1,tmp2);
-+}
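-+//The identity used above: x + y = 2*(x & y) + (x ^ y), hence (x + y) >> 1 = (x & y) + ((x ^ y) >> 1) with no intermediate
-+//overflow. E.g. x = 100, y = 60: (x & y) = 36, (x ^ y) = 88, and 36 + (88 >> 1) = 80 = (100 + 60) >> 1.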
-+
-+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srai_epi16(tmp2,1);
-+ return _mm_add_epi16(tmp1,tmp2);
-+}
-+
-+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srai_epi32(tmp2,1);
-+ return _mm_add_epi32(tmp1,tmp2);
-+}
-+
-+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
-+{
-+ __m128i c1, sum, res;
-+ c1 = _mm_set1_epi8(1);
-+ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
-+ res = _mm_xor_si128(a, b); //for rounding compensation
-+ res = _mm_and_si128(res,c1); //for rounding compensation
-+ return _mm_sub_epi8 (sum, res); //actual rounding compensation
-+}
-+
-+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
-+{
-+ __m128i sum, res;
-+ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
-+ res = _mm_xor_si128(a, b); //for rounding compensation
-+ res = _mm_slli_epi16 (res,15); //shift left then back right to
-+ res = _mm_srli_epi16 (res,15); //get 1 or zero
-+ return _mm_sub_epi16 (sum, res); //actual rounding compensation
-+}
-+
-+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srli_epi32(tmp2,1);
-+ return _mm_add_epi32(tmp1,tmp2);
-+}
-+
-+//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
-+//*****************************************************************************************************************************
-+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-+}
-+
-+
-+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-+}
-+
-+
-+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
-+{
-+ //no signed average in x86 SIMD, go to unsigned
-+ __m128i c128, au, bu, sum;
-+ c128 = _mm_set1_epi8(0x80); //-128
-+ au = _mm_sub_epi8(a, c128); //add 128
-+ bu = _mm_sub_epi8(b, c128); //add 128
-+ sum = _mm_avg_epu8(au, bu);
-+ return _mm_add_epi8 (sum, c128); //sub 128
-+}
-+
-+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
-+{
-+ //no signed average in x86 SIMD, go to unsigned
-+ __m128i cx8000, au, bu, sum;
-+ cx8000 = _mm_set1_epi16(0x8000); // - 32768
-+ au = _mm_sub_epi16(a, cx8000); //add 32768
-+ bu = _mm_sub_epi16(b, cx8000); //add 32768
-+ sum = _mm_avg_epu16(au, bu);
-+ return _mm_add_epi16 (sum, cx8000); //sub 32768
-+}
-+
-+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
-+{
-+ //need to avoid overflow
-+ __m128i a2, b2, res, sum;
-+ a2 = _mm_srai_epi32(a,1); //a2=a/2;
-+ b2 = _mm_srai_epi32(b,1); // b2=b/2;
-+ res = _mm_or_si128(a,b); //for rounding
-+ res = _mm_slli_epi32 (res,31); //shift left then back right to
-+ res = _mm_srli_epi32 (res,31); //get 1 or zero
-+ sum = _mm_add_epi32(a2,b2);
-+ return _mm_add_epi32(sum,res);
-+}
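-+//Here (a + b + 1) >> 1 is computed as (a >> 1) + (b >> 1) + ((a | b) & 1); the OR of the two low bits supplies the rounding
-+//carry. E.g. a = 5, b = 8: (5 >> 1) + (8 >> 1) + (1 | 0) = 2 + 4 + 1 = 7 = (5 + 8 + 1) >> 1.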
-+
-+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
-+
-+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
-+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
-+
-+
-+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
-+{
-+ //need to avoid overflow
-+ __m128i a2, b2, res, sum;
-+ a2 = _mm_srli_epi32(a,1); //a2=a/2;
-+ b2 = _mm_srli_epi32(b,1); // b2=b/2;
-+ res = _mm_or_si128(a,b); //for rounding
-+ res = _mm_slli_epi32 (res,31); //shift left then back right to
-+ res = _mm_srli_epi32 (res,31); //get 1 or zero
-+ sum = _mm_add_epi32(a2,b2);
-+ return _mm_add_epi32(sum,res);
-+}
-+
-+//****************** VQADD: Vector saturating add ************************
-+//************************************************************************
-+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int64x1_t res;
-+ uint64_t a64, b64;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 + b64;
-+ a64 = (a64 >> 63) + (~_SIGNBIT64);
-+ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
-+ res.m64_u64[0] = a64;
-+ }
-+ return res;
-+}
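-+//In the serial code above a64 is reassigned to the saturation value (0x7fffffffffffffff for a non-negative a,
-+//0x8000000000000000 for a negative a); the ((b64 ^ a64) | ~(res ^ b64)) >= 0 test is non-negative exactly when a and b
-+//share a sign while the sum has the opposite sign, i.e. when signed overflow occurred, and only then is the saturated
-+//value returned.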
-+
-+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t a64, b64;
-+ uint64x1_t res;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 + b64;
-+ if (res.m64_u64[0] < a64) {
-+ res.m64_u64[0] = ~(uint64_t)0;
-+ }
-+ return res;
-+}
-+
-+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-+#define vqaddq_s8 _mm_adds_epi8
-+
-+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-+#define vqaddq_s16 _mm_adds_epi16
-+
-+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
-+{
-+    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
-+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_add_epi32(a, b);
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
-+ res_xor_a = _mm_xor_si128(res, a);
-+ b_xor_a_ = _mm_xor_si128(b, a);
-+ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
-+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
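-+//Note on the trick above: res_sat = (a >> 31, logical) + 0x7fffffff gives 0x7fffffff for a non-negative a and 0x80000000
-+//for a negative a, i.e. the correct saturation value; it replaces the plain sum only in the lanes where overflow is
-+//detected (a and b of equal sign, sum of the opposite sign).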
-+
-+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] + btmp[0];
-+ res[1] = atmp[1] + btmp[1];
-+
-+ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
-+ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
-+
-+ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
-+ res[0] = atmp[0];
-+ }
-+ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
-+ res[1] = atmp[1];
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-+#define vqaddq_u8 _mm_adds_epu8
-+
-+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
-+#define vqaddq_u16 _mm_adds_epu16
-+
-+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
-+{
-+ __m128i c80000000, cmp, subsum, suba, sum;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ sum = _mm_add_epi32 (a, b);
-+ subsum = _mm_sub_epi32 (sum, c80000000);
-+ suba = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
-+ return _mm_or_si128 (sum, cmp); //saturation
-+}
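-+//Subtracting 0x80000000 from both values maps unsigned order onto signed order, so the signed compare above effectively
-+//tests a > (a + b) as unsigned, which is true exactly when the addition wrapped; or-ing the sum with the resulting
-+//all-ones mask saturates those lanes to 0xffffffff.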
-+
-+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
-+ {
-+ __m128i c80000000, sum, cmp, suba, subsum;
-+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-+ sum = _mm_add_epi64 (a, b);
-+ subsum = _mm_sub_epi64 (sum, c80000000);
-+ suba = _mm_sub_epi64 (a, c80000000);
-+ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
-+ return _mm_or_si128 (sum, cmp); //saturation
-+ }
-+#else
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] + btmp[0];
-+ res[1] = atmp[1] + btmp[1];
-+ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
-+ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
-+ return _mm_load_si128((__m128i*)(res));
-+ }
-+#endif
-+
-+
-+//******************* Vector add high half (truncated) ******************
-+//************************************************************************
-+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi16 (a, b);
-+ sum = _mm_srai_epi16 (sum, 8);
-+ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
-+ return64(sum);
-+}
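-+//vaddhn keeps only the most significant half of each sum lane. E.g. for 16-bit lanes a = 0x1234 and b = 0x0111 the sum is
-+//0x1345 and the returned 8-bit lane is 0x13.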
-+
-+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
-+{
-+ int16x4_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi32 (a, b);
-+ sum = _mm_srai_epi32(sum, 16);
-+ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi64 (a, b);
-+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sum);
-+}
-+
-+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi16 (a, b);
-+ sum = _mm_srli_epi16 (sum, 8);
-+ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
-+{
-+ uint16x4_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi32 (a, b);
-+ sum = _mm_srli_epi32 (sum, 16);
-+ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-+#define vaddhn_u64 vaddhn_s64
-+
-+//*********** Vector rounding add high half: vraddhn_<type> ******************.
-+//***************************************************************************
-+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sum = _mm_srai_epi16 (sum, 8); //get high half
-+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
-+ sum = _mm_packs_epi16 (sum, sum);
-+ return64(sum);
-+}
-+
-+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
-+{
-+    //SIMD may not be optimal, serial code may be faster
-+ int16x4_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sum = _mm_srai_epi32 (sum, 16); //get high half
-+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
-+ sum = _mm_packs_epi32 (sum, sum);
-+ return64(sum);
-+}
-+
-+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
-+{
-+    //SIMD may not be optimal, serial code may be faster
-+ int32x2_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi64 (a, b);
-+ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
-+ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
-+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
-+ return64(sum);
-+}
-+
-+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sum = _mm_srai_epi16 (sum, 8); //get high half
-+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
-+ sum = _mm_packus_epi16 (sum, sum);
-+ return64(sum);
-+}
-+
-+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
-+{
-+    //SIMD may not be optimal, serial code may be faster
-+ uint16x4_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sum = _mm_srai_epi32 (sum, 16); //get high half
-+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
-+ sum = _MM_PACKUS1_EPI32 (sum);
-+ return64(sum);
-+}
-+
-+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-+#define vraddhn_u64 vraddhn_s64
-+
-+//**********************************************************************************
-+//********* Multiplication *************************************
-+//**************************************************************************************
-+
-+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-+//As we do not widen the result, these functions are equivalent to "multiply low" in x86
-+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits in SSE
-+ int8x8_t res64;
-+ __m128i a128, b128, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (a128, b128);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
-+ return64(res);
-+}
-+
-+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
-+#define vmul_s16 vmul_u16
-+
-+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
-+#define vmul_s32 vmul_u32
-+
-+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t tmp;
-+ __m64_128 res64;
-+ tmp = _mm_mul_ps(_pM128(a),_pM128(b));
-+ _M64f(res64, tmp); //use low 64 bits
-+ return res64;
-+}
-+
-+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits in SSE
-+ uint8x8_t res64;
-+ __m128i mask, a128, b128, res;
-+ mask = _mm_set1_epi16(0xff);
-+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
-+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
-+ res = _mm_mullo_epi16 (a128, b128);
-+ res = _mm_and_si128(res, mask); //to avoid saturation
-+ res = _mm_packus_epi16 (res,res); //use only low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
-+ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
-+ return res;
-+}
-+
-+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
-+{
-+ //may be optimized
-+ poly8x8_t res64;
-+ __m128i a64, b64, c1, res, tmp, bmasked;
-+ int i;
-+ a64 = _pM128i(a);
-+ b64 = _pM128i(b);
-+ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b64, c1); //0x1
-+ res = vmulq_u8(a64, bmasked);
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b64, c1); //0x1
-+ tmp = vmulq_u8(a64, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return64 (res);
-+}
-+
-+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ //solution may not be optimal
-+ __m128i a16, b16, r16_1, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (a16, b16);
-+ //swap hi and low part of a and b to process the remaining data
-+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2
-+
-+ r16_2 = _mm_mullo_epi16 (a16, b16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-+
-+ return _mm_unpacklo_epi64(r16_1, r16_2);
-+}
-+
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-+#define vmulq_s16 _mm_mullo_epi16
-+
-+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
-+
-+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-+#define vmulq_f32 _mm_mul_ps
-+
-+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ //solution may not be optimal
-+ __m128i maskff, a16, b16, r16_1, r16_2;
-+ maskff = _mm_set1_epi16(0xff);
-+ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (a16, b16);
-+ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
-+ //swap hi and low part of a and b to process the remaining data
-+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (a16, b16);
-+ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
-+ return _mm_packus_epi16 (r16_1, r16_2);
-+}
-+
-+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-+#define vmulq_u16 _mm_mullo_epi16
-+
-+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
-+
-+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
-+{
-+ //may be optimized
-+ __m128i c1, res, tmp, bmasked;
-+ int i;
-+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b, c1); //0x1
-+ res = vmulq_u8(a, bmasked);
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b, c1); //0x1
-+ tmp = vmulq_u8(a, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return res;
-+}
-+
-+//************************* Vector long multiply ***********************************
-+//****************************************************************************
-+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
-+{
-+ //no 8 bit simd multiply, need to go to 16 bits
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-+}
-+
-+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
-+{
-+ #ifdef USE_SSE4
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
-+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-+ #else
-+ __m128i low, hi, a128,b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ low = _mm_mullo_epi16(a128,b128);
-+ hi = _mm_mulhi_epi16(a128,b128);
-+ return _mm_unpacklo_epi16(low,hi);
-+ #endif
-+}
-+
-+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
-+{
-+ __m128i ab, ba, a128, b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-+ return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+}
-+
-+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
-+{
-+ //no 8 bit simd multiply, need to go to 16 bits
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-+}
-+
-+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
-+{
-+ #ifdef USE_SSE4
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
-+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-+ #else
-+ __m128i a128,b128,low, hi;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ low = _mm_mullo_epi16(a128,b128);
-+ hi = _mm_mulhi_epu16(a128,b128);
-+ return _mm_unpacklo_epi16(low,hi);
-+ #endif
-+}
-+
-+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ __m128i ab, ba, a128, b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-+ return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+}
-+
-+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
-+{
-+ //may be optimized
-+ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
-+ int i;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b128, c1); //0x1
-+
-+ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
-+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-+ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b128, c1); //0x1
-+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-+ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return res;
-+}
-+
-+//****************Vector saturating doubling long multiply **************************
-+//*****************************************************************
-+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
-+{
-+ //the serial solution may be faster due to saturation
-+ __m128i res;
-+ res = vmull_s16(a, b);
-+ return vqd_s32(res);
-+}
-+
-+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //the serial solution may be faster due to saturation
-+ __m128i res;
-+ res = vmull_s32(a,b);
-+ return vqaddq_s64(res,res); //slow serial function!!!!
-+}
-+
-+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
-+//******************************************************************************************
-+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
-+{
-+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
-+ int8x8_t res64;
-+ __m128i b128, c128, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (c128, b128);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
-+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+
-+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
-+ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ //fma is coming soon, but right now:
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_mul_ps (_pM128(c), _pM128(b));
-+ res = _mm_add_ps (_pM128(a), res);
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
-+{
-+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
-+ uint8x8_t res64;
-+ __m128i mask, b128, c128, res;
-+ mask = _mm_set1_epi16(0xff);
-+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (c128, b128);
-+ res = _mm_and_si128(res, mask); //to avoid saturation
-+ res = _mm_packus_epi16 (res, res);
-+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-+#define vmla_u16 vmla_s16
-+
-+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-+#define vmla_u32 vmla_s32
-+
-+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
-+{
-+ //solution may not be optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2,r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_add_epi8 (r16_1, a);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_add_epi8(r16_2, a_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_mullo_epi16 (c, b);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
-+{
-+ //fma is coming soon, but right now:
-+ __m128 res;
-+ res = _mm_mul_ps (c, b);
-+ return _mm_add_ps (a, res);
-+}
-+
-+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
-+{
-+ //solution may not be optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_add_epi8 (r16_1, a);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_add_epi8(r16_2, a_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-+#define vmlaq_u16 vmlaq_s16
-+
-+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-+#define vmlaq_u32 vmlaq_s32
-+
-+//********************** Vector widening multiply accumulate (long multiply accumulate):
-+// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
-+//********************************************************************************************
-+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
-+{
-+ int16x8_t res;
-+ res = vmull_s8(b, c);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int32x4_t res;
-+ res = vmull_s16(b, c);
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int64x2_t res;
-+ res = vmull_s32( b, c);
-+ return _mm_add_epi64 (res, a);
-+}
-+
-+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
-+{
-+ uint16x8_t res;
-+ res = vmull_u8(b, c);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ uint32x4_t res;
-+ res = vmull_u16(b, c);
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int64x2_t res;
-+ res = vmull_u32( b,c);
-+ return _mm_add_epi64 (res, a);
-+}
-+
-+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
-+//********************************************************************************************
-+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
-+ int8x8_t res64;
-+ __m128i res;
-+ res64 = vmul_s8(b,c);
-+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-+ return64(res);
-+}
-+
-+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+
-+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
-+ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
-+ return64(res);
-+}
-+
-+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_mul_ps (_pM128(c), _pM128(b));
-+ res = _mm_sub_ps (_pM128(a), res);
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
-+ uint8x8_t res64;
-+ __m128i res;
-+ res64 = vmul_u8(b,c);
-+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-+ return64(res);
-+}
-+
-+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-+#define vmls_u16 vmls_s16
-+
-+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-+#define vmls_u32 vmls_s32
-+
-+
-+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
-+{
-+ //solution may not be optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
-+ r16_1 = _mm_sub_epi8 (a, r16_1);
-+ //swap hi and low part of a, b, c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_sub_epi8 (a_2, r16_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_mullo_epi16 (c, b);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
-+{
-+ __m128 res;
-+ res = _mm_mul_ps (c, b);
-+ return _mm_sub_ps (a, res);
-+}
-+
-+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
-+{
-+ //solution may not be optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_sub_epi8 (a, r16_1);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_sub_epi8(a_2, r16_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-+#define vmlsq_u16 vmlsq_s16
-+
-+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-+#define vmlsq_u32 vmlsq_s32
-+
-+//******************** Vector multiply subtract long (widening multiply subtract) ************************************
-+//*************************************************************************************************************
-+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
-+{
-+ int16x8_t res;
-+ res = vmull_s8(b, c);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int32x4_t res;
-+ res = vmull_s16(b, c);
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int64x2_t res;
-+ res = vmull_s32( b,c);
-+ return _mm_sub_epi64 (a, res);
-+}
-+
-+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
-+{
-+ uint16x8_t res;
-+ res = vmull_u8(b, c);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ uint32x4_t res;
-+ res = vmull_u16(b, c);
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
-+{
-+ //may not be optimal compared with a serial implementation
-+ int64x2_t res;
-+ res = vmull_u32( b,c);
-+ return _mm_sub_epi64 (a, res);
-+}
-+
-+//****** Vector saturating doubling multiply high **********************
-+//*************************************************************************
-+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int32_t a32, b32, i;
-+ for (i = 0; i<4; i++) {
-+ a32 = (int32_t) a.m64_i16[i];
-+ b32 = (int32_t) b.m64_i16[i];
-+ a32 = (a32 * b32) >> 15;
-+ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
-+ }
-+ return res;
-+}
-+
-+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
-+{
-+ //may not be optimal compared with a serial solution
-+ int32x2_t res64;
-+ __m128i mask;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ int64x2_t mul;
-+ mul = vmull_s32(a,b);
-+ mul = _mm_slli_epi64(mul,1); //double the result
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
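-+ //saturation note: the doubled result equals 0x80000000 only for a==b==0x80000000; the compare and XOR below flip that lane to 0x7fffffff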
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+ return64(mul);
-+}
-+
-+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
-+{
-+ __m128i res, res_lo, mask;
-+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-+ res = _mm_mulhi_epi16 (a, b);
-+ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
-+ res_lo = _mm_mullo_epi16 (a, b);
-+ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
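-+ //(2*a*b)>>16 == ((a*b)>>16 << 1) + bit 15 of the low product, hence the addition below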
-+ res = _mm_add_epi16(res, res_lo); //combine results
-+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
-+}
-+
-+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
-+ __m128i ab, ba, mask, mul, mul1;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-+ mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul = _mm_slli_epi64(mul,1); //double the result
-+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-+ mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul1 = _mm_slli_epi64(mul1,1); //double the result
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-+ mul = _mm_unpacklo_epi64(mul, mul1);
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+}
-+
-+//********* Vector saturating rounding doubling multiply high ****************
-+//****************************************************************************
-+//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
-+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //may not be optimal compared with a serial solution
-+ int32x2_t res64;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ __m128i res_sat, mask, mask1;
-+ int64x2_t mul;
-+ mul = vmull_s32(a,b);
-+ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
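-+ //only the rounding bit added into each high 32-bit half matters; the low halves are discarded by the shuffle below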
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+ return64(mul);
-+}
-+
-+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
-+{
-+ __m128i mask, res;
-+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-+ res = _mm_mulhrs_epi16 (a, b);
-+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
-+}
-+
-+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
-+ __m128i ab, ba, mask, mul, mul1, mask1;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-+ mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul = _mm_add_epi32 (mul, mask1); //actual rounding
-+
-+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-+ mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mul = _mm_unpacklo_epi64(mul, mul1);
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+}
-+
-+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
-+//*************************************************************************************************************************
-+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
-+{
-+ //not an optimal SIMD solution, serial may be faster
-+ __m128i res32;
-+ res32 = vmull_s16(b, c);
-+ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
-+ return vqaddq_s32(res32, a); //saturation
-+}
-+
-+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res64;
-+ res64 = vmull_s32(b,c);
-+ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
-+ return vqaddq_s64(res64, a); //saturation
-+}
-+
-+//************************************************************************************
-+//****************** Vector subtract ***********************************************
-+//************************************************************************************
-+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
-+ return res64;
-+}
-+
-+
-+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
-+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
-+ return res;
-+}
-+
-+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-+#define vsub_u8 vsub_s8
-+
-+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-+#define vsub_u16 vsub_s16
-+
-+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-+#define vsub_u32 vsub_s32
-+
-+
-+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
-+ return res64;
-+}
-+
-+
-+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-+#define vsubq_s8 _mm_sub_epi8
-+
-+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-+#define vsubq_s16 _mm_sub_epi16
-+
-+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-+#define vsubq_s32 _mm_sub_epi32
-+
-+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-+#define vsubq_s64 _mm_sub_epi64
-+
-+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-+#define vsubq_f32 _mm_sub_ps
-+
-+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-+#define vsubq_u8 _mm_sub_epi8
-+
-+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-+#define vsubq_u16 _mm_sub_epi16
-+
-+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-+#define vsubq_u32 _mm_sub_epi32
-+
-+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-+#define vsubq_u64 _mm_sub_epi64
-+
-+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
-+//***********************************************************************************
-+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a16, b16);
-+}
-+
-+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a32, b32);
-+}
-+
-+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
-+{
-+ //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi64 (a64, b64);
-+}
-+
-+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a16, b16);
-+}
-+
-+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a32, b32);
-+}
-+
-+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
-+{
-+ //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi64 (a64, b64);
-+}
-+
-+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
-+//*****************************************************************************************************
-+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a, b16);
-+}
-+
-+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a, b32);
-+}
-+
-+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_sub_epi64 (a, b64);
-+}
-+
-+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a, b16);
-+}
-+
-+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
-+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a, b32);
-+}
-+
-+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_sub_epi64 (a, b64);
-+}
-+
-+//************************Vector saturating subtract *********************************
-+//*************************************************************************************
-+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-+{
-+ uint64x1_t res;
-+ uint64_t a64,b64;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 - b64;
-+
-+ a64 = (a64 >> 63) + (~_SIGNBIT64);
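-+ //a64 is reused as the saturation value: INT64_MAX if a was non-negative, INT64_MIN otherwise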
-+ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
-+ res.m64_u64[0] = a64;
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint64x1_t res;
-+ uint64_t a64, b64;
-+ a64 = _Ui64(a);
-+ b64 = _Ui64(b);
-+ if (a64 > b64) {
-+ res.m64_u64[0] = a64 - b64;
-+ } else {
-+ res.m64_u64[0] = 0;
-+ }
-+ return res;
-+}
-+
-+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-+#define vqsubq_s8 _mm_subs_epi8
-+
-+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-+#define vqsubq_s16 _mm_subs_epi16
-+
-+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
-+{
-+ //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the difference has the opposite sign to a
-+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_sub_epi32(a, b);
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
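-+ //res_sat is the per-lane saturation value: 0x7fffffff where a >= 0, 0x80000000 where a < 0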
-+ res_xor_a = _mm_xor_si128(res, a);
-+ b_xor_a = _mm_xor_si128(b, a);
-+ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
-+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
-+
-+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] - btmp[0];
-+ res[1] = atmp[1] - btmp[1];
-+ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
-+ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
-+ }
-+ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
-+ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-+#define vqsubq_u8 _mm_subs_epu8
-+
-+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
-+#define vqsubq_u16 _mm_subs_epu16
-+
-+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
-+{
-+ __m128i min, mask, sub;
-+ min = _MM_MIN_EPU32(a, b); //SSE4.1
-+ mask = _mm_cmpeq_epi32 (min, b);
-+ sub = _mm_sub_epi32 (a, b);
-+ return _mm_and_si128 ( sub, mask);
-+}
-+
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
-+ {
-+ __m128i c80000000, subb, suba, cmp, sub;
-+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-+ sub = _mm_sub_epi64 (a, b);
-+ suba = _mm_sub_epi64 (a, c80000000);
-+ subb = _mm_sub_epi64 (b, c80000000);
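-+ //biasing both operands by 2^63 maps unsigned order onto signed order, so a signed compare can be used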
-+ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
-+ return _mm_and_si128 (sub, cmp); //saturation
-+ }
-+#else
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
-+ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
-+ return _mm_load_si128((__m128i*)(res));
-+ }
-+#endif
-+
-+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
-+//****************************************************************
-+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
-+{
-+ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
-+ int8x8_t res64;
-+ __m128i r16;
-+ int8x8_t r;
-+ r = vsub_s8 (a, b);
-+ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
-+ r16 = _mm_srai_epi16 (r16, 1); //SSE2
-+ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
-+ return64(r16);
-+}
-+
-+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+
-+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i c128, au,bu;
-+ c128 = _mm_set1_epi8 (128);
-+ au = _mm_add_epi8( a, c128);
-+ bu = _mm_add_epi8( b, c128);
-+ return vhsubq_u8(au,bu);
-+}
-+
-+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i c8000, au,bu;
-+ c8000 = _mm_set1_epi16(0x8000);
-+ au = _mm_add_epi16( a, c8000);
-+ bu = _mm_add_epi16( b, c8000);
-+ return vhsubq_u16(au,bu);
-+}
-+
-+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i a2, b2,r, b_1;
-+ a2 = _mm_srai_epi32 (a,1);
-+ b2 = _mm_srai_epi32 (b,1);
-+ r = _mm_sub_epi32 (a2, b2);
-+ b_1 = _mm_andnot_si128(a, b); //!a and b
-+ b_1 = _mm_slli_epi32 (b_1,31);
-+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: the last bit of (~a & b)
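-+ //floor((a - b)/2) == (a>>1) - (b>>1) - ((~a & b) & 1), so subtract the correction bit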
-+ return _mm_sub_epi32(r,b_1);
-+}
-+
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
-+{
-+ __m128i avg;
-+ avg = _mm_avg_epu8 (a, b);
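-+ //(a - b)>>1 == a - ((a + b + 1)>>1) == a - avg(a,b); the identity also holds modulo 2^8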
-+ return _mm_sub_epi8(a, avg);
-+}
-+
-+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
-+{
-+ __m128i avg;
-+ avg = _mm_avg_epu16 (a, b);
-+ return _mm_sub_epi16(a, avg);
-+}
-+
-+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i a2, b2,r, b_1;
-+ a2 = _mm_srli_epi32 (a,1);
-+ b2 = _mm_srli_epi32 (b,1);
-+ r = _mm_sub_epi32 (a2, b2);
-+ b_1 = _mm_andnot_si128(a, b); //!a and b
-+ b_1 = _mm_slli_epi32 (b_1,31);
-+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1: the last bit of (~a & b)
-+ return _mm_sub_epi32(r,b_1);
-+}
-+
-+//******* Vector subtract high half (truncated) ** ************
-+//************************************************************
-+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum, sum8;
-+ sum = _mm_sub_epi16 (a, b);
-+ sum8 = _mm_srai_epi16 (sum, 8);
-+ sum8 = _mm_packs_epi16(sum8,sum8);
-+ return64(sum8);
-+}
-+
-+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
-+{
-+ int16x4_t res64;
-+ __m128i sum, sum16;
-+ sum = _mm_sub_epi32 (a, b);
-+ sum16 = _mm_srai_epi32 (sum, 16);
-+ sum16 = _mm_packs_epi32(sum16,sum16);
-+ return64(sum16);
-+}
-+
-+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i sub;
-+ sub = _mm_sub_epi64 (a, b);
-+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sub);
-+}
-+
-+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum, sum8;
-+ sum = _mm_sub_epi16 (a, b);
-+ sum8 = _mm_srli_epi16 (sum, 8);
-+ sum8 = _mm_packus_epi16(sum8,sum8);
-+ return64(sum8);
-+}
-+
-+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
-+{
-+ uint16x4_t res64;
-+ __m128i sum, sum16;
-+ sum = _mm_sub_epi32 (a, b);
-+ sum16 = _mm_srli_epi32 (sum, 16);
-+ sum16 = _MM_PACKUS1_EPI32(sum16);
-+ return64(sum16);
-+}
-+
-+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-+#define vsubhn_u64 vsubhn_s64
-+
-+//************ Vector rounding subtract high half *********************
-+//*********************************************************************
-+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sub = _mm_srai_epi16 (sub, 8); //get high half
-+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
-+ sub = _mm_packs_epi16 (sub, sub);
-+ return64(sub);
-+}
-+
-+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
-+{
-+ //SIMD may not be optimal, serial may be faster
-+ int16x4_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sub = _mm_srai_epi32 (sub, 16); //get high half
-+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
-+ sub = _mm_packs_epi32 (sub, sub);
-+ return64(sub);
-+}
-+
-+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ //SIMD may not be optimal, serial may be faster
-+ int32x2_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi64 (a, b);
-+ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
-+ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
-+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sub);
-+}
-+
-+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sub = _mm_srai_epi16 (sub, 8); //get high half
-+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
-+ sub = _mm_packus_epi16 (sub, sub);
-+ return64(sub);
-+}
-+
-+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
-+{
-+ //SIMD may not be optimal, serial may be faster
-+ uint16x4_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sub = _mm_srai_epi32 (sub, 16); //get high half
-+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
-+ sub = _MM_PACKUS1_EPI32 (sub);
-+ return64(sub);
-+}
-+
-+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+#define vrsubhn_u64 vrsubhn_s64
-+
-+//*********** Vector saturating doubling multiply subtract long ********************
-+//************************************************************************************
-+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ //not an optimal SIMD solution, serial may be faster
-+ __m128i res32, mask;
-+ int32x4_t res;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ res = vmull_s16(b, c);
-+ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
-+ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
-+ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
-+ return vqsubq_s32(a, res32); //saturation
-+}
-+
-+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res64, mask;
-+ int64x2_t res;
-+ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
-+ res = vmull_s32(b, c);
-+ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
-+ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
-+ res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000
-+ return vqsubq_s64(a, res64); //saturation
-+}
-+
-+//****************** COMPARISON ***************************************
-+//******************* Vector compare equal *************************************
-+//****************************************************************************
-+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
-+ return64f(res);
-+}
-+
-+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-+#define vceq_p8 vceq_u8
-+
-+
-+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_s8 _mm_cmpeq_epi8
-+
-+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-+#define vceqq_s16 _mm_cmpeq_epi16
-+
-+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-+#define vceqq_s32 _mm_cmpeq_epi32
-+
-+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpeq_ps(a,b);
-+ return _M128i(res);
-+}
-+
-+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_u8 _mm_cmpeq_epi8
-+
-+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-+#define vceqq_u16 _mm_cmpeq_epi16
-+
-+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-+#define vceqq_u32 _mm_cmpeq_epi32
-+
-+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_p8 _mm_cmpeq_epi8
-+
-+//******************Vector compare greater-than or equal*************************
-+//*******************************************************************************
-+//IA SIMD has no greater-than-or-equal comparison for integers,
-+//only greater-than is available, so we need the following tricks
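-+//A scalar sketch of the tricks (illustrative, mirrors the code below):
-+// signed: a >= b <=> (a > b) || (a == b), hence OR-ing the cmpgt and cmpeq masks;
-+// unsigned with SSE4.1: a >= b <=> max(a, b) == a, e.g. max(200, 100) == 200 -> all ones;
-+// unsigned without SSE4.1: bias both sides by 0x80/0x8000/0x80000000 so the signed compare
-+// preserves the unsigned order, e.g. 200 - 128 = 72 > 100 - 128 = -28, just as 200 > 100.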
-+
-+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
-+ return64f(res);
-+}
-+
-+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ //serial solution looks faster
-+ uint32x2_t res64;
-+ return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+
-+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi8 ( a, b);
-+ m2 = _mm_cmpeq_epi8 ( a, b);
-+ return _mm_or_si128 ( m1, m2);
-+}
-+
-+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi16 ( a, b);
-+ m2 = _mm_cmpeq_epi16 ( a, b);
-+ return _mm_or_si128 ( m1,m2);
-+}
-+
-+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi32 (a, b);
-+ m2 = _mm_cmpeq_epi32 (a, b);
-+ return _mm_or_si128 (m1, m2);
-+}
-+
-+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpge_ps(a,b);
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-+{
-+ //no unsigned chars comparison, only signed is available, so we need the trick
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu8(a, b);
-+ return _mm_cmpeq_epi8(cmp, a); //a>=b
-+ #else
-+ __m128i c128, as, bs, m1, m2;
-+ c128 = _mm_set1_epi8 (128);
-+ as = _mm_sub_epi8( a, c128);
-+ bs = _mm_sub_epi8( b, c128);
-+ m1 = _mm_cmpgt_epi8( as, bs);
-+ m2 = _mm_cmpeq_epi8 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-+{
-+ //no unsigned shorts comparison, only signed is available, so we need the trick
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu16(a, b);
-+ return _mm_cmpeq_epi16(cmp, a); //a>=b
-+ #else
-+ __m128i c8000, as, bs, m1, m2;
-+ c8000 = _mm_set1_epi16 (0x8000);
-+ as = _mm_sub_epi16(a,c8000);
-+ bs = _mm_sub_epi16(b,c8000);
-+ m1 = _mm_cmpgt_epi16(as, bs);
-+ m2 = _mm_cmpeq_epi16 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-+{
-+ //no unsigned ints comparison, only signed is available, so we need the trick
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu32(a, b);
-+ return _mm_cmpeq_epi32(cmp, a); //a>=b
-+ #else
-+ //serial solution may be faster
-+ __m128i c80000000, as, bs, m1, m2;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ as = _mm_sub_epi32(a,c80000000);
-+ bs = _mm_sub_epi32(b,c80000000);
-+ m1 = _mm_cmpgt_epi32 (as, bs);
-+ m2 = _mm_cmpeq_epi32 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+//**********************Vector compare less-than or equal******************************
-+//***************************************************************************************
-+//IA SIMD has no less-than-or-equal comparison for integers, so we need the tricks below
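-+//A scalar sketch (illustrative): a <= b <=> !(a > b), so the signed q-forms compute cmpgt
-+//and invert it with ANDNOT against all ones; with SSE4.1, unsigned a <= b <=> min(a, b) == a.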
-+
-+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vcleq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vcleq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vcleq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
-+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmple_ps(_pM128(a),_pM128(b));
-+ return64f(res);
-+}
-+
-+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+#define vcle_u8(a,b) vcge_u8(b,a)
-+
-+
-+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
-+#define vcle_u16(a,b) vcge_u16(b,a)
-+
-+
-+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+#define vcle_u32(a,b) vcge_u32(b,a)
-+
-+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi8 ( a, b);
-+ return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
-+}
-+
-+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi16 ( a, b);
-+ return _mm_andnot_si128 (res, c1);
-+}
-+
-+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi32 ( a, b);
-+ return _mm_andnot_si128 (res, c1);
-+}
-+
-+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmple_ps(a,b);
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-+ {
-+ //no unsigned chars comparison in SSE, only signed is available, so we need the trick
-+ __m128i cmp;
-+ cmp = _mm_min_epu8(a, b);
-+ return _mm_cmpeq_epi8(cmp, a); //a<=b
-+ }
-+#else
-+ #define vcleq_u8(a,b) vcgeq_u8(b,a)
-+#endif
-+
-+
-+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-+ {
-+ //no unsigned shorts comparison in SSE, only signed is available, so we need the trick
-+ __m128i cmp;
-+ cmp = _mm_min_epu16(a, b);
-+ return _mm_cmpeq_epi16(cmp, a); //a<=b
-+ }
-+#else
-+ #define vcleq_u16(a,b) vcgeq_u16(b,a)
-+#endif
-+
-+
-+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-+ {
-+ //no unsigned ints comparison in SSE, only signed is available, so we need the trick
-+ __m128i cmp;
-+ cmp = _mm_min_epu32(a, b);
-+ return _mm_cmpeq_epi32(cmp, a); //a<=b
-+ }
-+#else
-+//this solution may not be optimal compared with the serial one
-+ #define vcleq_u32(a,b) vcgeq_u32(b,a)
-+#endif
-+
-+
-+//****** Vector compare greater-than ******************************************
-+//**************************************************************************
-+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
-+ return64f(res);
-+}
-+
-+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+#define vcgtq_s8 _mm_cmpgt_epi8
-+
-+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+#define vcgtq_s16 _mm_cmpgt_epi16
-+
-+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+#define vcgtq_s32 _mm_cmpgt_epi32
-+
-+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpgt_ps(a,b);
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
-+{
-+ //no unsigned chars comparison, only signed is available, so we need the trick
-+ __m128i c128, as, bs;
-+ c128 = _mm_set1_epi8 (128);
-+ as = _mm_sub_epi8(a,c128);
-+ bs = _mm_sub_epi8(b,c128);
-+ return _mm_cmpgt_epi8 (as, bs);
-+}
-+
-+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
-+{
-+ //no unsigned short comparison, only signed is available, so we need the trick
-+ __m128i c8000, as, bs;
-+ c8000 = _mm_set1_epi16 (0x8000);
-+ as = _mm_sub_epi16(a,c8000);
-+ bs = _mm_sub_epi16(b,c8000);
-+ return _mm_cmpgt_epi16 ( as, bs);
-+}
-+
-+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
-+{
-+ //no unsigned int comparison, only signed is available, so we need the trick
-+ __m128i c80000000, as, bs;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ as = _mm_sub_epi32(a,c80000000);
-+ bs = _mm_sub_epi32(b,c80000000);
-+ return _mm_cmpgt_epi32 ( as, bs);
-+}
-+
-+//********************* Vector compare less-than **************************
-+//*************************************************************************
-+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
-+
-+
-+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
-+
-+
-+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
-+
-+
-+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
-+
-+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
-+
-+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
-+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
-+
-+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
-+
-+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
-+
-+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
-+
-+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
-+
-+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
-+
-+//*****************Vector compare absolute greater-than or equal ************
-+//***************************************************************************
-+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpge_ps ( a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpge_ps ( a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//********Vector compare absolute less-than or equal ******************
-+//********************************************************************
-+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmple_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmple_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//******** Vector compare absolute greater-than ******************
-+//******************************************************************
-+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpgt_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpgt_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//***************Vector compare absolute less-than ***********************
-+//*************************************************************************
-+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmplt_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmplt_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//*************************Vector test bits************************************
-+//*****************************************************************************
-+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
-+with the corresponding element of a second vector. If the result is not zero, the
-+corresponding element in the destination vector is set to all ones. Otherwise, it is set to
-+all zeros. */
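-+//A minimal scalar sketch of the per-lane semantics (illustrative), assuming 8-bit lanes:
-+// r[i] = ((a[i] & b[i]) != 0) ? 0xff : 0x00;
-+//e.g. a[i] = 0x0f, b[i] = 0xf0 gives 0x00, while a[i] = 0x0f, b[i] = 0x01 gives 0xff.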
-+
-+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vtstq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vtstq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vtstq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
-+#define vtst_u8 vtst_s8
-+
-+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
-+#define vtst_u16 vtst_s16
-+
-+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
-+#define vtst_u32 vtst_s32
-+
-+
-+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-+#define vtst_p8 vtst_u8
-+
-+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi8 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi16 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi32 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-+#define vtstq_u8 vtstq_s8
-+
-+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-+#define vtstq_u16 vtstq_s16
-+
-+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-+#define vtstq_u32 vtstq_s32
-+
-+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
-+#define vtstq_p8 vtstq_u8
-+
-+//****************** Absolute difference ********************
-+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
-+//************************************************************
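-+//The unsigned q-forms below have no packed absolute value to rely on, so they select between
-+//(a-b) and (b-a) with the a>b mask; e.g. (illustrative) a = 3, b = 10: mask is 0, so b-a = 7 is taken.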
-+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vabdq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vabdq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vabdq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vabdq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vabdq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vabdq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = vabdq_f32(_pM128(a), _pM128(b));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi8 (a, b);
-+ return _mm_abs_epi8 (res);
-+}
-+
-+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi16 (a,b);
-+ return _mm_abs_epi16 (res);
-+}
-+
-+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi32 (a,b);
-+ return _mm_abs_epi32 (res);
-+}
-+
-+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u8(a,b);
-+ difab = _mm_sub_epi8(a,b);
-+ difba = _mm_sub_epi8 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
-+
-+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u16(a,b);
-+ difab = _mm_sub_epi16(a,b);
-+ difba = _mm_sub_epi16 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
-+
-+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u32(a,b);
-+ difab = _mm_sub_epi32(a,b);
-+ difba = _mm_sub_epi32 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
-+
-+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
-+{
-+ __m128i c1;
-+ __m128 res;
-+ c1 = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_sub_ps (a, b);
-+ return _mm_and_ps (res, *(__m128*)&c1);
-+}
-+
-+//************ Absolute difference - long **************************
-+//********************************************************************
-+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return vabdq_s16(a16, b16);
-+
-+}
-+
-+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return vabdq_s32(a32, b32);
-+}
-+
-+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no optimal SIMD solution, serial looks faster
-+ _NEON2SSE_ALIGN_16 int64_t res[2];
-+ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
-+ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
-+ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
-+ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ __m128i res;
-+ res = vsubl_u8(a,b);
-+ return _mm_abs_epi16(res);
-+}
-+
-+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ __m128i res;
-+ res = vsubl_u16(a,b);
-+ return _mm_abs_epi32(res);
-+}
-+
-+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
-+ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
-+ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
-+ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
-+//*********************************************************************************************
-+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t res64;
-+ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
-+{
-+ int32x2_t res64;
-+ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-+#define vaba_u8 vaba_s8
-+
-+
-+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
-+#define vaba_u16 vaba_s16
-+
-+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint32x2_t res64;
-+ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
-+{
-+ int8x16_t sub;
-+ sub = vabdq_s8(b, c);
-+ return vaddq_s8( a, sub);
-+}
-+
-+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
-+{
-+ int16x8_t sub;
-+ sub = vabdq_s16(b, c);
-+ return vaddq_s16( a, sub);
-+}
-+
-+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
-+{
-+ int32x4_t sub;
-+ sub = vabdq_s32(b, c);
-+ return vaddq_s32( a, sub);
-+}
-+
-+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
-+{
-+ uint8x16_t sub;
-+ sub = vabdq_u8(b, c);
-+ return vaddq_u8( a, sub);
-+}
-+
-+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
-+{
-+ uint16x8_t sub;
-+ sub = vabdq_u16(b, c);
-+ return vaddq_u16( a, sub);
-+}
-+
-+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
-+{
-+ uint32x4_t sub;
-+ sub = vabdq_u32(b, c);
-+ return vaddq_u32( a, sub);
-+}
-+
-+//************** Absolute difference and accumulate - long ********************************
-+//*************************************************************************************
-+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
-+{
-+ __m128i b16, c16, res;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
-+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-+ return _mm_add_epi16 (a, res);
-+}
-+
-+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
-+{
-+ __m128i b32, c32, res;
-+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
-+ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
-+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-+ return _mm_add_epi32 (a, res);
-+}
-+
-+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res;
-+ res = vabdl_s32(b,c);
-+ return _mm_add_epi64(a, res);
-+}
-+
-+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ __m128i b16, c16, res;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
-+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-+ return _mm_add_epi16 (a, res);
-+}
-+
-+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ __m128i b32, c32, res;
-+ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
-+ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
-+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-+ return _mm_add_epi32 (a, res);
-+}
-+
-+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res;
-+ res = vabdl_u32(b,c);
-+ return _mm_add_epi64(a, res);
-+}
-+
-+//***********************************************************************************
-+//**************** Maximum and minimum operations **********************************
-+//***********************************************************************************
-+//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
-+//***********************************************************************************
-+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as serial
-+ return64(res);
-+}
-+
-+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-+ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-+ return res;
-+}
-+
-+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
-+
-+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-+#define vmaxq_s16 _mm_max_epi16
-+
-+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
-+
-+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-+#define vmaxq_u8 _mm_max_epu8
-+
-+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
-+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
-+
-+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
-+
-+
-+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-+#define vmaxq_f32 _mm_max_ps
-+
-+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
-+//***********************************************************************************************************
-+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as serial
-+ return64(res);
-+}
-+
-+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-+ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-+ return res;
-+}
-+
-+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
-+
-+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-+#define vminq_s16 _mm_min_epi16
-+
-+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
-+
-+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-+#define vminq_u8 _mm_min_epu8
-+
-+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
-+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
-+
-+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
-+
-+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
-+#define vminq_f32 _mm_min_ps
-+
-+//************* Pairwise addition operations. **************************************
-+//************************************************************************************
-+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
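-+//A sketch of the lane layout (illustrative), e.g. for 8-bit lanes:
-+// vpadd(a, b) = {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7}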
-+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit and then pack
-+ int8x8_t res64;
-+ __m128i a16, b16, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-+ res = _mm_hadd_epi16 (a16, b16);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ __m128i hadd128;
-+ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
-+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(hadd128);
-+}
-+
-+
-+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i hadd128;
-+ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
-+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(hadd128);
-+}
-+
-+
-+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
-+{
-+ // no 8 bit hadd in IA32, need to go to 16 bit and then pack
-+ uint8x8_t res64;
-+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned fits in 16-bit signed, so it should work
-+ __m128i mask8, a16, b16, res;
-+ mask8 = _mm_set1_epi16(0xff);
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-+ res = _mm_hadd_epi16 (a16, b16);
-+ res = _mm_and_si128(res, mask8); //to avoid saturation
-+ res = _mm_packus_epi16 (res,res); //use low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
-+{
-+ // this solution may not be optimal; serial execution may be faster
-+ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
-+ uint16x4_t res64;
-+ __m128i c32767, cfffe, as, bs, res;
-+ c32767 = _mm_set1_epi16 (32767);
-+ cfffe = _mm_set1_epi16 (0xfffe);
-+ as = _mm_sub_epi16 (_pM128i(a), c32767);
-+ bs = _mm_sub_epi16 (_pM128i(b), c32767);
-+ res = _mm_hadd_epi16 (as, bs);
-+ res = _mm_add_epi16 (res, cfffe);
-+ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(res);
-+}
-+
-+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
-+{
-+ //hadd doesn't work for unsigned values
-+ uint32x2_t res64;
-+ __m128i ab, ab_sh, res;
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
-+ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
-+ res = _mm_add_epi32(ab, ab_sh);
-+ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(res);
-+}
-+
-+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
-+{
-+ __m128 hadd128;
-+ __m64_128 res64;
-+ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
-+ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
-+ _M64f(res64, hadd128);
-+ return res64;
-+}
-+
-+
-+//************************** Long pairwise add **********************************
-+//*********************************************************************************
-+//Adds adjacent pairs of elements of a vector, sign or zero extends the results to twice their original width,
-+// and places the final results in the destination vector.
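-+//e.g. (illustrative) vpaddl_s8({1, 2, 3, 4, 5, 6, 7, 8}) = {3, 7, 11, 15} as int16x4_t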
-+
-+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit anyway
-+ __m128i a16;
-+ int16x4_t res64;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
-+ return64(a16);
-+}
-+
-+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
-+{
-+ // this solution may not be optimal; serial execution may be faster
-+ int32x2_t res64;
-+ __m128i r32_1;
-+ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
-+ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
-+ return64(r32_1);
-+}
-+
-+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
-+ return res;
-+}
-+
-+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
-+{
-+ // no 8 bit hadd in IA32, need to go to 16 bit
-+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned fits in 16-bit signed, so it should work
-+ uint16x4_t res64;
-+ __m128i a16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
-+ return64(a16);
-+}
-+
-+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than a SIMD one
-+ uint32x2_t res;
-+ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
-+ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
-+ return res;
-+}
-+
-+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
-+ return res;
-+}
-+
-+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit
-+ __m128i r16_1, r16_2;
-+ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-+ //swap hi and low part of r to process the remaining data
-+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
-+ return _mm_hadd_epi16 (r16_1, r16_2);
-+}
-+
-+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
-+{
-+ //widen to 32 bit and use the 32 bit hadd
-+ __m128i r32_1, r32_2;
-+ r32_1 = _MM_CVTEPI16_EPI32(a);
-+ //swap hi and low part of r to process the remaining data
-+ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
-+ return _mm_hadd_epi32 (r32_1, r32_2);
-+}
-+
-+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 int64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
-+ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit
-+ __m128i r16_1, r16_2;
-+ r16_1 = _MM_CVTEPU8_EPI16(a);
-+ //swap hi and low part of r to process the remaining data
-+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
-+ return _mm_hadd_epi16 (r16_1, r16_2);
-+}
-+
-+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than a SIMD one
-+ _NEON2SSE_ALIGN_16 uint16_t atmp[8];
-+ _NEON2SSE_ALIGN_16 uint32_t res[4];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
-+ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
-+ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
-+ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
-+ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//************************ Long pairwise add and accumulate **************************
-+//****************************************************************************************
-+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
-+// and accumulates the values of the results into the elements of the destination (wide) vector
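-+//e.g. (illustrative) vpadal_s8({10, 20, 30, 40}, {1, 2, 3, 4, 5, 6, 7, 8})
-+// = {10+3, 20+7, 30+11, 40+15} = {13, 27, 41, 55}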
-+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
-+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
-+{
-+ int16x4_t res64;
-+ return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
-+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
-+{
-+ int32x2_t res64;
-+ return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
-+ return res;
-+}
-+
-+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
-+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
-+ return res;
-+}
-+
-+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
-+{
-+ int16x8_t pad;
-+ pad = vpaddlq_s8(b);
-+ return _mm_add_epi16 (a, pad);
-+}
-+
-+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
-+{
-+ int32x4_t pad;
-+ pad = vpaddlq_s16(b);
-+ return _mm_add_epi32(a, pad);
-+}
-+
-+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
-+{
-+ int64x2_t pad;
-+ pad = vpaddlq_s32(b);
-+ return _mm_add_epi64 (a, pad);
-+}
-+
-+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
-+{
-+ uint16x8_t pad;
-+ pad = vpaddlq_u8(b);
-+ return _mm_add_epi16 (a, pad);
-+}
-+
-+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x4_t pad;
-+ pad = vpaddlq_u16(b);
-+ return _mm_add_epi32(a, pad);
-+} //no optimal SIMD solution, serial is faster
-+
-+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no optimal SIMD solution, serial is faster
-+ uint64x2_t pad;
-+ pad = vpaddlq_u32(b);
-+ return _mm_add_epi64(a, pad);
-+} //no optimal SIMD solution, serial is faster
-+
-+//********** Folding maximum *************************************
-+//*******************************************************************
-+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
-+//and copies the larger of each pair into the corresponding element in the destination
-+// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
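-+//A sketch of the result layout (illustrative), e.g. for 8-bit lanes:
-+// vpmax(a, b) = {max(a0,a1), max(a2,a3), max(a4,a5), max(a6,a7), max(b0,b1), ..., max(b6,b7)};
-+//the code below widens to 128 bits, swaps each pair, takes a vertical max and keeps the odd lanes.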
-+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
-+{
-+ int8x8_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-+ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(max); //we need 64 bits only
-+}
-+
-+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ int16x4_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-+ max = _mm_max_epi16 (ab, ab1);
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(max);
-+}
-+
-+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ int32x2_t res;
-+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
-+{
-+ uint8x8_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-+ max = _mm_max_epu8 (ab, ab1); // SSE4.1
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(max);
-+}
-+
-+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
-+{
-+ //the solution may not be optimal compared with the serial one
-+ uint16x4_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-+ max = _MM_MAX_EPU16 (ab, ab1);
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(max);
-+}
-+
-+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ uint32x2_t res;
-+ res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
-+ res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
-+ return res;
-+} //serial solution looks faster than a SIMD one
-+
-+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-+ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-+ return res;
-+}
-+
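The folding maximum implementations above emulate VPMAX by swapping adjacent elements with a byte shuffle and then taking a vertical max. The NEON semantics being emulated are easiest to see in scalar form; a minimal sketch with a hypothetical helper name:

#include <stdint.h>

/* Scalar model of vpmax_s8: result lanes 0..3 hold the pairwise maxima
   of a, lanes 4..7 hold the pairwise maxima of b. */
static void pmax_s8_model(int8_t res[8], const int8_t a[8], const int8_t b[8])
{
    for (int i = 0; i < 4; i++) {
        res[i]     = (a[2 * i] > a[2 * i + 1]) ? a[2 * i] : a[2 * i + 1];
        res[i + 4] = (b[2 * i] > b[2 * i + 1]) ? b[2 * i] : b[2 * i + 1];
    }
}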
-+// ***************** Folding minimum ****************************
-+// **************************************************************
-+//vpmin -> takes minimum of adjacent pairs
-+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
-+{
-+ int8x8_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-+ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(min);
-+}
-+
-+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
-+{
-+ //the solution may not be optimal compared with the serial one
-+ int16x4_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-+ min = _mm_min_epi16 (ab, ab1);
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(min);
-+}
-+
-+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ int32x2_t res;
-+ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-+ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
-+{
-+ uint8x8_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-+ min = _mm_min_epu8 (ab, ab1); // SSE4.1
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(min);
-+}
-+
-+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
-+{
-+ //the solution may not be optimal compared with the serial one
-+ uint16x4_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-+ min = _MM_MIN_EPU16 (ab, ab1);
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(min);
-+}
-+
-+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ uint32x2_t res;
-+ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
-+ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
-+ return res;
-+}
-+
-+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-+ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-+ return res;
-+}
-+
-+//***************************************************************
-+//*********** Reciprocal/Sqrt ************************************
-+//***************************************************************
-+//****************** Reciprocal estimate *******************************
-+//the ARM NEON and x86 SIMD results may be slightly different
-+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = _mm_rcp_ps(_pM128(a));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is fixed point number!!! No reciprocal for ints in IA32 available
-+ uint32x2_t res;
-+ float resf, r;
-+ int i, q, s;
-+ for (i =0; i<2; i++){
-+ if((a.m64_u32[i] & 0x80000000) == 0) {
-+ res.m64_u32[i] = 0xffffffff;
-+ }else{
-+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
-+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
-+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res.m64_u32[i] = r * (uint32_t)(1 << 31);
-+ }
-+ }
-+ return res;
-+}
-+
-+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-+#define vrecpeq_f32 _mm_rcp_ps
-+
-+
-+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is fixed point number!!!
-+ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 uint32_t res[4];
-+ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
-+ float resf, r;
-+ int i, q, s;
-+ __m128i res128, mask, zero;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ zero = _mm_setzero_si128();
-+ for (i =0; i<4; i++){
-+ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
-+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
-+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
-+ }
-+ res128 = _mm_load_si128((__m128i*)res);
-+ mask = _mm_and_si128(a, *(__m128i*)c80000000);
-+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
-+ return _mm_or_si128(res128, mask);
-+}
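The unsigned reciprocal estimate follows the ARMv7 recip_estimate pseudocode in fixed point: scale the input into [0.5, 1), quantize it to 1/512 steps, take the reciprocal of the interval midpoint, and round that to 1/256 steps. A one-lane sketch in plain C, mirroring the loop body above (the helper name is hypothetical):

#include <stdint.h>

/* One-lane model of the VRECPE.U32 estimate: inputs without bit 31 set
   would overflow the fixed-point result, so all ones is returned. */
static uint32_t recip_estimate_u32(uint32_t x)
{
    if ((x & 0x80000000u) == 0)
        return 0xffffffffu;
    float scaled = (float)x * (0.5f / (float)(1u << 31));   /* in [0.5, 1) */
    int   q = (int)(scaled * 512.0f);                /* quantize to 1/512 */
    float r = 1.0f / (((float)q + 0.5f) / 512.0f);   /* reciprocal of midpoint */
    int   s = (int)(256.0f * r + 0.5f);              /* round to 1/256 */
    return (uint32_t)(((float)s / 256.0f) * (float)(1u << 31));
}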
-+
-+//**********Reciprocal square root estimate ****************
-+//**********************************************************
-+//no reciprocal square root for integers is available in IA32, nor an unsigned int to 4-lane float conversion, so a serial solution looks faster
-+//but the particular implementation of vrsqrte_u32 may vary across ARM compilers
-+//the ARM NEON and x86 SIMD results may be slightly different
-+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = _mm_rsqrt_ps(_pM128(a));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is fixed point number!!!
-+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
-+ uint32x2_t res;
-+ __m128 tmp;
-+ float r, resf, coeff;
-+ int i, q0, s;
-+ for (i =0; i<2; i++){
-+ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
-+ res.m64_u32[i] = 0xffffffff;
-+ }else{
-+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
-+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
-+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
-+ r = ((float)q0 + 0.5) / coeff;
-+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
-+ _mm_store_ss(&r, tmp);
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res.m64_u32[i] = r * (((uint32_t)1) << 31);
-+ }
-+ }
-+ return res;
-+}
-+
-+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-+#define vrsqrteq_f32 _mm_rsqrt_ps
-+
-+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is fixed point number!!!
-+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
-+ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
-+ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
-+ __m128 tmp;
-+ __m128i res128, mask, zero;
-+ float r, resf, coeff;
-+ int i, q0, s;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ zero = _mm_setzero_si128();
-+ for (i =0; i<4; i++){
-+ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
-+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
-+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
-+ r = ((float)q0 + 0.5) / coeff;
-+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
-+ _mm_store_ss(&r, tmp);
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
-+ }
-+ res128 = _mm_load_si128((__m128i*)res);
-+ mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
-+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
-+ return _mm_or_si128(res128, mask);
-+}
-+//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
-+//******************************************************************************************
-+//******VRECPS (Vector Reciprocal Step) ***************************************************
-+//multiplies the elements of one vector by the corresponding elements of another vector,
-+//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
-+
-+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = vrecpsq_f32(_pM128(a), _pM128(b));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
-+{
-+ __m128 f2, mul;
-+ f2 = _mm_set1_ps(2.);
-+ mul = _mm_mul_ps(a,b);
-+ return _mm_sub_ps(f2,mul);
-+}
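vrecps computes 2 - a*b, which is the Newton-Raphson correction term for a reciprocal, so the usual pattern combines a vrecpe estimate with one or two vrecps steps. A usage sketch, assuming this header's float32x4_t type and the vmulq_f32 intrinsic it also provides:

/* Refine the low-precision vrecpeq_f32 estimate with Newton-Raphson:
   x <- x * (2 - a*x); each step roughly doubles the accurate bits. */
static float32x4_t reciprocal_newton(float32x4_t a)
{
    float32x4_t x = vrecpeq_f32(a);
    x = vmulq_f32(x, vrecpsq_f32(a, x));
    x = vmulq_f32(x, vrecpsq_f32(a, x));
    return x;
}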
-+
-+//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
-+//multiplies the elements of one vector by the corresponding elements of another vector,
-+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
-+
-+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
-+ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
-+ return res;
-+}
-+
-+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
-+{
-+ __m128 f3, f05, mul;
-+ f3 = _mm_set1_ps(3.);
-+ f05 = _mm_set1_ps(0.5);
-+ mul = _mm_mul_ps(a,b);
-+ f3 = _mm_sub_ps(f3,mul);
-+ return _mm_mul_ps (f3, f05);
-+}
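vrsqrts computes (3 - a*b)/2, the matching Newton-Raphson term for a reciprocal square root. A usage sketch under the same assumption about vmulq_f32:

/* Refine the low-precision vrsqrteq_f32 estimate:
   x <- x * (3 - a*x*x) / 2, applied twice. */
static float32x4_t rsqrt_newton(float32x4_t a)
{
    float32x4_t x = vrsqrteq_f32(a);
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));
    return x;
}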
-+//********************************************************************************************
-+//***************************** Shifts by signed variable ***********************************
-+//********************************************************************************************
-+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
-+//********************************************************************************************
-+//Unfortunately IA32 SIMD has no such operations, only shifts by a constant, so a serial solution is needed
-+//helper macro. It matches ARM implementation for big shifts
-+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
-+ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
-+ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
-+ return res;
-+
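The SERIAL_SHIFT and SERIAL_SHIFT_64 macros handle one lane at a time: non-negative counts shift left, negative counts shift right, and counts whose magnitude reaches the lane width yield zero. A one-lane sketch of the expanded logic for a signed 32-bit element (the helper name is hypothetical):

#include <stdint.h>

/* One-lane model of the variable shift implemented by the macros above. */
static int32_t vshl_lane_s32_model(int32_t a, int32_t b)
{
    const int lanesize = 32;
    if (b >= lanesize || b <= -lanesize)
        return 0;
    return (b >= 0) ? (int32_t)(a << b) : (int32_t)(a >> -b);
}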
-+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(8, i, 8)
-+}
-+
-+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(16, i, 4)
-+}
-+
-+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(32, i, 2)
-+}
-+
-+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(64, i, 1)
-+}
-+
-+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(8, u, 8)
-+}
-+
-+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(16, u, 4)
-+}
-+
-+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(32, u, 2)
-+}
-+
-+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro we need special processing for large shift numbers
-+{
-+ SERIAL_SHIFT_64(64, u, 1)
-+}
-+
-+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int8_t, int8_t, 16, 16)
-+}
-+
-+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int16_t, int16_t, 8, 8)
-+}
-+
-+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int32_t, int32_t, 4, 4)
-+}
-+
-+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int64_t, int64_t, 2, 2)
-+}
-+
-+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
-+}
-+
-+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
-+}
-+
-+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
-+}
-+
-+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
-+}
-+
-+
-+//*********** Vector saturating shift left: (negative values shift right) **********************
-+//********************************************************************************************
-+//No such operations are available in IA32 SIMD yet, only shifts by a constant, so a serial solution is needed
-+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) res[i] = 0; \
-+ else{ \
-+ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
-+ else{ \
-+ if (btmp[i]>lanesize_1) { \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ else res[i] = atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-+ TYPE lanesize = (sizeof(TYPE) << 3); \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) {res[i] = 0; \
-+ }else{ \
-+ if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
-+ else{ \
-+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
-+ else{ \
-+ limit = (TYPE) 1 << (lanesize - btmp[i]); \
-+ res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
-+ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
-+ else{ \
-+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
-+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-+ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
-+ }else{ \
-+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
-+ else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
-+ return res;
-+
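The signed saturating shift macros clamp a left shift that would overflow to the most positive or most negative lane value, depending on the sign of the input; negative counts fall back to a plain right shift. A one-lane model for 8-bit lanes (the helper name is hypothetical):

#include <stdint.h>

/* One-lane model of the saturating shift behind vqshl_s8. */
static int8_t vqshl_lane_s8_model(int8_t a, int8_t b)
{
    if (a == 0)
        return 0;
    if (b < 0)
        return (int8_t)(a >> -b);
    int8_t sat = (a > 0) ? INT8_MAX : INT8_MIN;
    if (b > 7)
        return sat;
    int limit = 1 << (7 - b);              /* smallest magnitude that overflows */
    return (a >= limit || a <= -limit) ? sat : (int8_t)(a << b);
}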
-+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
-+}
-+
-+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
-+}
-+
-+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
-+}
-+
-+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
-+}
-+
-+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
-+}
-+
-+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
-+}
-+
-+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
-+}
-+
-+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
-+}
-+
-+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
-+}
-+
-+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
-+}
-+
-+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
-+}
-+
-+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
-+}
-+
-+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
-+}
-+
-+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
-+}
-+
-+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
-+}
-+
-+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
-+}
-+
-+
-+//******** Vector rounding shift left: (negative values shift right) **********
-+//****************************************************************************
-+//No such operations are available in IA32 SIMD yet, only shifts by a constant, so a serial solution is needed
-+//rounding makes sense for right shifts only.
-+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if( btmp[i] >= 0) { \
-+ if(btmp[i] >= lanesize) res[i] = 0; \
-+ else res[i] = (atmp[i] << btmp[i]); \
-+ }else{ \
-+ res[i] = (btmp[i] < -lanesize) ? 0 : \
-+ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
-+ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+
-+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
-+ for (i = 0; i<LEN; i++) { \
-+ if( b.m64_i ## TYPE[i] >= 0) { \
-+ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
-+ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
-+ }else{ \
-+ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
-+ (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
-+ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
-+ return res;
-+
-+
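The rounding shift macros differ from the plain variable shifts only for negative counts: the last bit that would be shifted out is added back in, so right shifts round to nearest instead of truncating. A one-lane model for a signed 16-bit element (the helper name is hypothetical):

#include <stdint.h>

/* One-lane model of the rounding variable shift (vrshl_s16 style). */
static int16_t vrshl_lane_s16_model(int16_t a, int16_t b)
{
    const int lanesize = 16;
    if (b >= 0)
        return (b >= lanesize) ? 0 : (int16_t)(a << b);
    if (b < -lanesize)
        return 0;
    int roundbit = (a >> (-b - 1)) & 1;    /* last bit shifted out */
    return (b == -lanesize) ? (int16_t)roundbit
                            : (int16_t)((a >> -b) + roundbit);
}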
-+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(8,i,8)
-+}
-+
-+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(16,i,4)
-+}
-+
-+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(32,i,2)
-+}
-+
-+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(64,i,1)
-+}
-+
-+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(8,u,8)
-+}
-+
-+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(16,u,4)
-+}
-+
-+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(32,u,2)
-+}
-+
-+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(64,u,1)
-+}
-+
-+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
-+}
-+
-+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
-+}
-+
-+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
-+}
-+
-+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
-+}
-+
-+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
-+}
-+
-+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
-+}
-+
-+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
-+}
-+
-+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
-+}
-+
-+
-+//********** Vector saturating rounding shift left: (negative values shift right) ****************
-+//*************************************************************************************************
-+//Unfortunately IA32 SIMD has no such operations, only shifts by a constant, so a serial solution is needed
-+//Saturation happens for left shifts only while rounding makes sense for right shifts only.
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) res[i] = 0; \
-+ else{ \
-+ if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-+ else{ \
-+ if (btmp[i]>lanesize_1) { \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ else res[i] = atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-+ int lanesize = (sizeof(TYPE) << 3); \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) {res[i] = 0; \
-+ }else{ \
-+ if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-+ else{ \
-+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
-+ else{ \
-+ limit = (TYPE) 1 << (lanesize - btmp[i]); \
-+ res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
-+ __m64_128 res; int ## TYPE ## _t limit; int i; \
-+ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
-+ else{ \
-+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-+ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-+ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
-+ }else{ \
-+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
-+ else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
-+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
-+}
-+
-+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
-+}
-+
-+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
-+}
-+
-+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
-+}
-+
-+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
-+}
-+
-+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
-+}
-+
-+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
-+}
-+
-+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
-+}
-+
-+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
-+}
-+
-+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
-+}
-+
-+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
-+}
-+
-+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
-+}
-+
-+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
-+}
-+
-+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
-+}
-+
-+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
-+}
-+
-+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
-+}
-+
-+// *********************************************************************************
-+// ***************************** Shifts by a constant *****************************
-+// *********************************************************************************
-+//**************** Vector shift right by constant*************************************
-+//************************************************************************************
-+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_srai_epi16 (r, b); //SSE2
-+ r = _mm_packs_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_srai_epi16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_srai_epi32(_pM128i(a), b));
-+}
-+
-+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no arithmetic shift for 64bit values, serial solution used
-+ int64x1_t res;
-+ if(b>=64) res.m64_i64[0] = 0;
-+ else res.m64_i64[0] = (*(int64_t*)&a) >> b;
-+ return res;
-+}
-+
-+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i r;
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
-+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_srli_epi16(_pM128i(a), b));
-+}
-+
-+
-+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ return64(_mm_srli_epi32(_pM128i(a), b));
-+}
-+
-+
-+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
-+{
-+ uint64x1_t res64;
-+ return64(_mm_srli_epi64(_pM128i(a), b));
-+}
-+
-+
-+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i zero, mask0, a_sign, r, a_sign_mask;
-+ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
-+ zero = _mm_setzero_si128();
-+ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
-+ r = _mm_srai_epi16 (a, b);
-+ a_sign_mask = _mm_and_si128 (mask0, a_sign);
-+ r = _mm_andnot_si128 (mask0, r);
-+ return _mm_or_si128 (r, a_sign_mask);
-+}
-+
-+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-+#define vshrq_n_s16 _mm_srai_epi16
-+
-+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-+#define vshrq_n_s32 _mm_srai_epi32
-+
-+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-+{
-+ //the SIMD implementation may not be optimal due to the absence of a 64 bit arithmetic shift in x86 SIMD
-+ __m128i c1, signmask,a0, res64;
-+ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
-+ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
-+ signmask = _mm_slli_epi64 (c1, (64 - b));
-+ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
-+ a0 = _MM_CMPEQ_EPI64 (a, a0);
-+ signmask = _mm_and_si128(a0, signmask);
-+ res64 = _mm_srli_epi64 (a, b);
-+ return _mm_or_si128(res64, signmask);
-+}
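SSE2 has only a logical 64-bit right shift, so vshrq_n_s64 above rebuilds the sign extension by OR-ing a mask of b high bits into the negative lanes. The equivalent scalar idea, with a hypothetical name and b restricted to 1..63 so every shift stays well defined:

#include <stdint.h>

/* Scalar model of emulating a 64-bit arithmetic right shift with a
   logical shift plus a sign-extension mask, as vshrq_n_s64 does. */
static int64_t sra64_model(int64_t a, int b)   /* 1 <= b <= 63 */
{
    uint64_t logical  = (uint64_t)a >> b;
    uint64_t signmask = (a < 0) ? (~0ull << (64 - b)) : 0;
    return (int64_t)(logical | signmask);
}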
-+
-+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
-+{
-+ //no 8 bit shift available, need the special trick
-+ __m128i mask0, r;
-+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
-+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ r = _mm_srli_epi16 ( a, b);
-+ return _mm_and_si128 (r, mask0);
-+}
-+
-+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
-+#define vshrq_n_u16 _mm_srli_epi16
-+
-+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-+#define vshrq_n_u32 _mm_srli_epi32
-+
-+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-+#define vshrq_n_u64 _mm_srli_epi64
-+
-+//*************************** Vector shift left by constant *************************
-+//*********************************************************************************
-+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_slli_epi16 (r, b); //SSE2
-+ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_slli_epi16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_slli_epi32(_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
-+{
-+ int64x1_t res64;
-+ return64(_mm_slli_epi64(_pM128i(a), b));
-+}
-+
-+
-+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i mask8;
-+ __m128i r;
-+ mask8 = _mm_set1_epi16(0xff);
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_slli_epi16 (r, b); //SSE2
-+ r = _mm_and_si128(r, mask8); //to avoid saturation
-+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+#define vshl_n_u16 vshl_n_s16
-+
-+
-+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+#define vshl_n_u32 vshl_n_s32
-+
-+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+#define vshl_n_u64 vshl_n_s64
-+
-+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+#define vshlq_n_s8 vshlq_n_u8
-+
-+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+#define vshlq_n_s16 _mm_slli_epi16
-+
-+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+#define vshlq_n_s32 _mm_slli_epi32
-+
-+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+#define vshlq_n_s64 _mm_slli_epi64
-+
-+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
-+{
-+ //no 8 bit shift available, need the special trick
-+ __m128i mask0, r;
-+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
-+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ r = _mm_slli_epi16 ( a, b);
-+ return _mm_and_si128 (r, mask0);
-+}
-+
-+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+#define vshlq_n_u16 vshlq_n_s16
-+
-+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+#define vshlq_n_u32 vshlq_n_s32
-+
-+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+#define vshlq_n_u64 vshlq_n_s64
-+
-+//************* Vector rounding shift right by constant ******************
-+//*************************************************************************
-+//No corresponding x86 intrinsics exist, need to do some tricks
-+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r, maskb;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-+ r = _mm_srai_epi16 (r, b);
-+ r = _mm_add_epi16 (r, maskb); //actual rounding
-+ r = _mm_packs_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
-+{
-+ int16x4_t res64;
-+ return64(vrshrq_n_s16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(vrshrq_n_s32(_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution is faster
-+ int64x1_t res;
-+ int64_t a_i64 = *( int64_t*)&a;
-+ if(b==64) {
-+ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use (a_i64 & _SIGNBIT64) >> 63;
-+ } else {
-+ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
-+ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
-+ uint8x8_t res64;
-+ __m128i r, maskb;
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-+ r = _mm_srli_epi16 (r, b);
-+ r = _mm_add_epi16 (r, maskb); //actual rounding
-+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
-+{
-+ uint16x4_t res64;
-+ return64(vrshrq_n_u16(_pM128i(a), b));
-+}
-+
-+
-+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ return64(vrshrq_n_u32(_pM128i(a), b));
-+}
-+
-+
-+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
-+{
-+ uint64x1_t res64;
-+ return64(vrshrq_n_u64(_pM128i(a), b));
-+}
-+
-+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i r, mask1, maskb;
-+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-+ r = vshrq_n_s8 (a, b);
-+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
-+ return _mm_add_epi8(r, maskb); //actual rounding
-+}
-+
-+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-+ r = _mm_srai_epi16 (a, b);
-+ return _mm_add_epi16 (r, maskb); //actual rounding
-+}
-+
-+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-+ r = _mm_srai_epi32(a, b);
-+ return _mm_add_epi32 (r, maskb); //actual rounding
-+}
-+
-+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-+{
-+    //solution may not be optimal compared with a serial one
-+ __m128i maskb;
-+ int64x2_t r;
-+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-+ r = vshrq_n_s64(a, b);
-+ return _mm_add_epi64 (r, maskb); //actual rounding
-+}
-+
-+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i r, mask1, maskb;
-+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-+ r = vshrq_n_u8 (a, b);
-+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
-+ return _mm_add_epi8(r, maskb); //actual rounding
-+}
-+
-+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-+ r = _mm_srli_epi16 (a, b);
-+ return _mm_add_epi16 (r, maskb); //actual rounding
-+}
-+
-+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-+ r = _mm_srli_epi32(a, b);
-+ return _mm_add_epi32 (r, maskb); //actual rounding
-+}
-+
-+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
-+{
-+    //solution may not be optimal compared with a serial one
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-+ r = _mm_srli_epi64(a, b);
-+ return _mm_add_epi64 (r, maskb); //actual rounding
-+}
-+
-+//************* Vector shift right by constant and accumulate *********
-+//*********************************************************************
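-+//SSE has no fused shift-and-accumulate, so each intrinsic below is simply vshr_n followed by vadd.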
-+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
-+{
-+ int8x8_t shift;
-+ shift = vshr_n_s8(b, c);
-+ return vadd_s8( a, shift);
-+}
-+
-+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
-+{
-+ int16x4_t shift;
-+ shift = vshr_n_s16( b, c);
-+ return vadd_s16(a, shift);
-+}
-+
-+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ int32x2_t shift;
-+ shift = vshr_n_s32(b, c);
-+ return vadd_s32( a, shift);
-+}
-+
-+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-+{
-+    //may not be optimal compared with a serial solution
-+ int64x1_t shift;
-+ shift = vshr_n_s64(b, c);
-+ return vadd_s64( a, shift);
-+}
-+
-+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
-+{
-+ uint8x8_t shift;
-+ shift = vshr_n_u8(b, c);
-+ return vadd_u8(a, shift);
-+}
-+
-+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
-+{
-+ uint16x4_t shift;
-+ shift = vshr_n_u16(b, c);
-+ return vadd_u16(a,shift);
-+}
-+
-+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ uint32x2_t shift;
-+ shift = vshr_n_u32(b, c);
-+ return vadd_u32( a, shift);
-+}
-+
-+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
-+{
-+    //may not be optimal compared with the serial execution
-+ uint64x1_t shift;
-+ shift = vshr_n_u64(b, c);
-+ return vadd_u64(a, shift);
-+}
-+
-+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
-+{
-+ int8x16_t shift;
-+ shift = vshrq_n_s8(b, c);
-+ return vaddq_s8(a, shift);
-+}
-+
-+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
-+{
-+ int16x8_t shift;
-+ shift = vshrq_n_s16(b, c);
-+ return vaddq_s16(a, shift);
-+}
-+
-+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
-+{
-+ int32x4_t shift;
-+ shift = vshrq_n_s32(b, c);
-+ return vaddq_s32(a, shift);
-+}
-+
-+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
-+{
-+ int64x2_t shift;
-+ shift = vshrq_n_s64(b, c);
-+ return vaddq_s64( a, shift);
-+}
-+
-+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
-+{
-+ uint8x16_t shift;
-+ shift = vshrq_n_u8(b, c);
-+ return vaddq_u8(a, shift);
-+}
-+
-+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
-+{
-+ uint16x8_t shift;
-+ shift = vshrq_n_u16(b, c);
-+ return vaddq_u16(a, shift);
-+}
-+
-+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
-+{
-+ uint32x4_t shift;
-+ shift = vshrq_n_u32(b, c);
-+ return vaddq_u32(a, shift);
-+}
-+
-+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
-+{
-+ uint64x2_t shift;
-+ shift = vshrq_n_u64(b, c);
-+ return vaddq_u64(a, shift);
-+}
-+
-+//************* Vector rounding shift right by constant and accumulate ****************************
-+//************************************************************************************************
-+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
-+{
-+ int8x8_t shift;
-+ shift = vrshr_n_s8(b, c);
-+ return vadd_s8( a, shift);
-+}
-+
-+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
-+{
-+ int16x4_t shift;
-+ shift = vrshr_n_s16( b, c);
-+ return vadd_s16(a, shift);
-+}
-+
-+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ int32x2_t shift;
-+ shift = vrshr_n_s32(b, c);
-+ return vadd_s32( a, shift);
-+}
-+
-+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-+{
-+ int64x1_t shift;
-+ shift = vrshr_n_s64(b, c);
-+ return vadd_s64( a, shift);
-+}
-+
-+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
-+{
-+ uint8x8_t shift;
-+ shift = vrshr_n_u8(b, c);
-+ return vadd_u8(a, shift);
-+}
-+
-+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
-+{
-+ uint16x4_t shift;
-+ shift = vrshr_n_u16(b, c);
-+ return vadd_u16(a,shift);
-+}
-+
-+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ uint32x2_t shift;
-+ shift = vrshr_n_u32(b, c);
-+ return vadd_u32( a, shift);
-+}
-+
-+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-+{
-+    //may not be optimal compared with the serial execution
-+ uint64x1_t shift;
-+ shift = vrshr_n_u64(b, c);
-+ return vadd_u64( a, shift);
-+}
-+
-+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
-+{
-+ int8x16_t shift;
-+ shift = vrshrq_n_s8(b, c);
-+ return vaddq_s8(a, shift);
-+}
-+
-+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
-+{
-+ int16x8_t shift;
-+ shift = vrshrq_n_s16(b, c);
-+ return vaddq_s16(a, shift);
-+}
-+
-+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
-+{
-+ int32x4_t shift;
-+ shift = vrshrq_n_s32(b, c);
-+ return vaddq_s32(a, shift);
-+}
-+
-+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-+{
-+ int64x2_t shift;
-+ shift = vrshrq_n_s64(b, c);
-+ return vaddq_s64(a, shift);
-+}
-+
-+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
-+{
-+ uint8x16_t shift;
-+ shift = vrshrq_n_u8(b, c);
-+ return vaddq_u8(a, shift);
-+}
-+
-+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
-+{
-+ uint16x8_t shift;
-+ shift = vrshrq_n_u16(b, c);
-+ return vaddq_u16(a, shift);
-+}
-+
-+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
-+{
-+ uint32x4_t shift;
-+ shift = vrshrq_n_u32(b, c);
-+ return vaddq_u32(a, shift);
-+}
-+
-+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
-+{
-+ uint64x2_t shift;
-+ shift = vrshrq_n_u64(b, c);
-+ return vaddq_u64(a, shift);
-+}
-+
-+//**********************Vector saturating shift left by constant *****************************
-+//********************************************************************************************
-+//we don't check const ranges assuming they are met
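-+//The general approach: widen to the next element size, shift there, and let the pack instructions saturate
-+//on the way back; the 32- and 64-bit cases have no saturating pack and use explicit masks or serial code.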
-+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-+ int8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b);
-+ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
-+{
-+ // go to 32 bit to get the auto saturation (in packs function)
-+ int16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
-+ return64(r128);
-+}
-+
-+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+ //serial execution may be faster
-+ int32x2_t res64;
-+ return64(vqshlq_n_s32 (_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ int64x1_t res;
-+ int64_t bmask;
-+ int64_t a_i64 = *( int64_t*)&a;
-+ bmask = ( int64_t)1 << (63 - b); //positive
-+ if (a_i64 >= bmask) {
-+ res.m64_i64[0] = ~(_SIGNBIT64);
-+ } else {
-+ res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
-+ }
-+ return res;
-+}
-+
-+
-+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b); //shift_res
-+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
-+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
-+{
-+ // go to 32 bit to get the auto saturation (in packus function)
-+ uint16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
-+ return64(r128);
-+}
-+
-+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
-+{
-+ uint32x2_t res64;
-+ return64(vqshlq_n_u32(_pM128i(a), b));
-+}
-+
-+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ uint64x1_t res;
-+ uint64_t bmask;
-+ uint64_t a_i64 = *(uint64_t*)&a;
-+ bmask = ( uint64_t)1 << (64 - b);
-+ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
-+ return res;
-+}
-+
-+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
-+{
-+ // go to 16 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
-+}
-+
-+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
-+{
-+ // manual saturation solution looks LESS optimal than 32 bits conversion one
-+ // go to 32 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI16_EPI32 (a128);
-+ r128_2 = _mm_slli_epi32 (a128, b);
-+ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
-+}
-+
-+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
-+{
-+ // no 64 bit saturation option available, special tricks necessary
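-+    // positive overflow when a > 2^(31-b) - 1: that bound is built by shifting all-ones right by (b+1);
-+    // negative overflow when a < -2^(31-b): that bound is built by shifting all-ones left by (31-b)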
-+ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
-+ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
-+ maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
-+ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
-+ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
-+ shift_res = _mm_slli_epi32 (a, b);
-+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-+ //result with positive numbers saturated
-+ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-+ //treat negative numbers
-+ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros
-+ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
-+ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
-+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-+ return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-+}
-+
-+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
-+ int64_t bmask;
-+ int i;
-+ bmask = ( int64_t)1 << (63 - b); //positive
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ if (atmp[i] >= bmask) {
-+ res[i] = ~(_SIGNBIT64);
-+ } else {
-+ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
-+ }
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
-+{
-+ // go to 16 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPU8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-+}
-+
-+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
-+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
-+{
-+ // manual saturation solution looks more optimal than 32 bits conversion one
-+ __m128i cb, c8000, a_signed, saturation_mask, shift_res;
-+ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
-+ c8000 = _mm_set1_epi16 (0x8000);
-+//no unsigned shorts comparison in SSE, only signed is available, so we need the trick
-+ a_signed = _mm_sub_epi16(a, c8000); //go to signed
-+ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
-+ shift_res = _mm_slli_epi16 (a, b);
-+ return _mm_or_si128 (shift_res, saturation_mask);
-+}
-+
-+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
-+{
-+ // manual saturation solution, no 64 bit saturation option, the serial version may be faster
-+ __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
-+ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+//no unsigned ints comparison in SSE, only signed is available, so we need the trick
-+ a_signed = _mm_sub_epi32(a, c80000000); //go to signed
-+ saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
-+ shift_res = _mm_slli_epi32 (a, b);
-+ return _mm_or_si128 (shift_res, saturation_mask);
-+}
-+
-+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
-+ uint64_t bmask;
-+ int i;
-+ bmask = ( uint64_t)1 << (64 - b);
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//**************Vector signed->unsigned saturating shift left by constant *************
-+//*************************************************************************************
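-+//Negative inputs saturate to 0 and overflowing positive inputs saturate to all ones,
-+//via the unsigned pack instructions where available and explicit masks or serial code otherwise.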
-+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-+ uint8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b);
-+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
-+{
-+ uint16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-+_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+    uint32x2_t res64;
-+ return64( vqshluq_n_s32(_pM128i(a), b));
-+}
-+
-+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
-+{
-+ uint64x1_t res;
-+ uint64_t limit;
-+ if (a.m64_i64[0]<=0) {
-+ res.m64_u64[0] = 0;
-+ } else {
-+ limit = (uint64_t) 1 << (64 - b);
-+        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
-+ }
-+ return res;
-+}
-+
-+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
-+{
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-+}
-+
-+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
-+{
-+ // manual saturation solution looks LESS optimal than 32 bits conversion one
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI16_EPI32 (a128);
-+ r128_2 = _mm_slli_epi32 (a128, b);
-+ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
-+}
-+
-+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
-+ zero = _mm_setzero_si128();
-+ maskA = _mm_cmpeq_epi32(a, a);
-+ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
-+ //saturate negative numbers to zero
-+    maskGT0 = _mm_cmpgt_epi32 (a, zero); //0xffffffff if positive number and zero otherwise (negative numbers)
-+ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
-+ //saturate positive to 0xffffffff
-+ a_masked = _mm_and_si128 (a0, maskA);
-+ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
-+ a_shift = _mm_slli_epi32 (a0, b);
-+ return _mm_or_si128 (a_shift, a_masked); //actual saturation
-+}
-+
-+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here, serial execution looks faster
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ uint64_t limit;
-+ int i;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ if (atmp[i]<=0) {
-+ res[i] = 0;
-+ } else {
-+ limit = (uint64_t) 1 << (64 - b);
-+            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
-+ }
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//************** Vector narrowing shift right by constant **************
-+//**********************************************************************
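-+//The shift is done at the wide element size, then the low half of every element is gathered into
-+//the low 64 bits with a shuffle or a pack (masking first where pack saturation could interfere).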
-+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r16);
-+}
-+
-+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //pick the low 16 bits of each 32-bit lane
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r32);
-+}
-+
-+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ __m128i r64;
-+ r64 = vshrq_n_s64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i mask, r16;
-+ mask = _mm_set1_epi16(0xff);
-+ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
-+ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i mask, r32;
-+ mask = _mm_set1_epi32(0xffff);
-+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ __m128i r64;
-+ r64 = vshrq_n_u64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************** Vector signed->unsigned narrowing saturating shift right by constant ********
-+//*********************************************************************************************
-+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ uint32x2_t res;
-+ int64_t res64;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if (atmp[0] < 0) {
-+ res.m64_u32[0] = 0;
-+ } else {
-+ res64 = (atmp[0] >> b);
-+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
-+ }
-+ if (atmp[1] < 0) {
-+ res.m64_u32[1] = 0;
-+ } else {
-+ res64 = (atmp[1] >> b);
-+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
-+ }
-+ return res;
-+}
-+
-+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
-+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i r16;
-+ uint8x8_t res64;
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i r32;
-+ uint16x4_t res64;
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ uint32x2_t res;
-+ int64_t res64;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if (atmp[0] < 0) {
-+ res.m64_u32[0] = 0;
-+ } else {
-+ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
-+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
-+ }
-+ if (atmp[1] < 0) {
-+ res.m64_u32[1] = 0;
-+ } else {
-+        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
-+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
-+ }
-+ return res;
-+}
-+
-+//***** Vector narrowing saturating shift right by constant ******
-+//*****************************************************************
-+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //no optimal SIMD solution found
-+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
-+ int32x2_t res;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res64[0] = (atmp[0] >> b);
-+ res64[1] = (atmp[1] >> b);
-+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)res64[0];
-+ res.m64_i32[1] = (int32_t)res64[1];
-+ return res;
-+}
-+
-+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+    r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i r64, res_hi, zero;
-+ zero = _mm_setzero_si128();
-+ r64 = vshrq_n_u64(a,b);
-+ res_hi = _mm_srli_epi64(r64, 32);
-+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
-+ r64 = _mm_or_si128(r64, res_hi);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+
-+//********* Vector rounding narrowing shift right by constant *************************
-+//****************************************************************************************
-+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r16);
-+}
-+
-+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //pick the low 16 bits of each 32-bit lane
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r32);
-+}
-+
-+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ __m128i r64;
-+ r64 = vrshrq_n_s64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i mask, r16;
-+ mask = _mm_set1_epi16(0xff);
-+ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i mask, r32;
-+ mask = _mm_set1_epi32(0xffff);
-+    r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
-+{
-+ uint32x2_t res64;
-+ __m128i r64;
-+ r64 = vrshrq_n_u64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************* Vector rounding narrowing saturating shift right by constant ************
-+//****************************************************************************************
-+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //no optimal SIMD solution found
-+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
-+ int32x2_t res;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
-+ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
-+ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
-+ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
-+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)res64[0];
-+ res.m64_i32[1] = (int32_t)res64[1];
-+ return res;
-+}
-+
-+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+    r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i r64, res_hi, zero;
-+ zero = _mm_setzero_si128();
-+ r64 = vrshrq_n_u64(a,b);
-+ res_hi = _mm_srli_epi64(r64, 32);
-+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
-+ r64 = _mm_or_si128(r64, res_hi);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************** Vector widening shift left by constant ****************
-+//************************************************************************
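-+//Each element is sign- or zero-extended to the wider type first and then shifted left there,
-+//so no bits are lost for the allowed shift amounts.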
-+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ return _mm_slli_epi16 (r, b);
-+}
-+
-+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
-+ return _mm_slli_epi32 (r, b);
-+}
-+
-+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
-+ return _mm_slli_epi64 (r, b);
-+}
-+
-+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
-+{
-+ //no uint8 to uint16 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi8(_pM128i(a), zero);
-+ return _mm_slli_epi16 (r, b);
-+}
-+
-+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
-+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
-+{
-+ //no uint16 to uint32 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi16(_pM128i(a), zero);
-+ return _mm_slli_epi32 (r, b);
-+}
-+
-+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
-+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
-+{
-+ //no uint32 to uint64 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi32(_pM128i(a), zero);
-+ return _mm_slli_epi64 (r, b);
-+}
-+
-+//************************************************************************************
-+//**************************** Shifts with insert ************************************
-+//************************************************************************************
-+//shifts each element in a vector by an immediate value
-+//and inserts the result in the destination vector. Bits shifted out of each element are lost.
-+
-+//**************** Vector shift right and insert ************************************
-+//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift.
-+//All other bits are taken from b shifted.
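-+//In other words: result = (a & mask_of_top_c_bits) | ((unsigned)b >> c).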
-+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
-+{
-+ int8x8_t res64;
-+ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
-+{
-+ int16x4_t res64;
-+ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
-+{
-+ int32x2_t res64;
-+ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-+{
-+ int64x1_t res;
-+ if (c ==64)
-+ res = a;
-+ else{
-+        res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_u64[0] >> (64 - c)) << (64 - c)); //treat a and b as unsigned so the shifts bring in zeros
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+#define vsri_n_u8 vsri_n_s8
-+
-+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+#define vsri_n_u16 vsri_n_s16
-+
-+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+#define vsri_n_u32 vsri_n_s32
-+
-+
-+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+#define vsri_n_u64 vsri_n_s64
-+
-+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+#define vsri_n_p8 vsri_n_u8
-+
-+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+#define vsri_n_p16 vsri_n_u16
-+
-+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
-+{
-+ __m128i maskA, a_masked;
-+ uint8x16_t b_shift;
-+ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
-+ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
-+ a_masked = _mm_and_si128 (a, maskA);
-+ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
-+ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
-+}
-+
-+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
-+{
-+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
-+ uint16x8_t b_shift;
-+ uint16x8_t a_c;
-+ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
-+ a_c = vshrq_n_u16( a, (16 - c));
-+ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
-+{
-+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
-+ uint32x4_t b_shift;
-+ uint32x4_t a_c;
-+ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
-+ a_c = vshrq_n_u32( a, (32 - c));
-+ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-+{
-+ //serial solution may be faster
-+ uint64x2_t b_shift;
-+ uint64x2_t a_c;
-+ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
-+ a_c = _mm_srli_epi64(a, (64 - c));
-+ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+#define vsriq_n_u8 vsriq_n_s8
-+
-+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+#define vsriq_n_u16 vsriq_n_s16
-+
-+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+#define vsriq_n_u32 vsriq_n_s32
-+
-+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+#define vsriq_n_u64 vsriq_n_s64
-+
-+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+#define vsriq_n_p8 vsriq_n_u8
-+
-+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+#define vsriq_n_p16 vsriq_n_u16
-+
-+//***** Vector shift left and insert *********************************************
-+//*********************************************************************************
-+//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift.
-+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
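-+//In other words: result = (b << c) | (a & mask_of_low_c_bits).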
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
-+{
-+ int8x8_t res64;
-+ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
-+{
-+ int16x4_t res64;
-+ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
-+{
-+ int32x2_t res64;
-+ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
-+}
-+
-+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
-+ return res;
-+}
-+
-+
-+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+#define vsli_n_u8 vsli_n_s8
-+
-+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+#define vsli_n_u16 vsli_n_s16
-+
-+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+#define vsli_n_u32 vsli_n_s32
-+
-+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+#define vsli_n_u64 vsli_n_s64
-+
-+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+#define vsli_n_p8 vsli_n_u8
-+
-+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+#define vsli_n_p16 vsli_n_u16
-+
-+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
-+{
-+ __m128i maskA, a_masked;
-+ int8x16_t b_shift;
-+ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
-+ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
-+ b_shift = vshlq_n_s8( b, c);
-+ a_masked = _mm_and_si128 (a, maskA);
-+ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
-+}
-+
-+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
-+{
-+ //to keep only the low "c" bits of a we shift left and then logical shift back right, leaving (16-c) zeros in the high bits of a
-+ int16x8_t b_shift;
-+ int16x8_t a_c;
-+ b_shift = vshlq_n_s16( b, c);
-+ a_c = vshlq_n_s16( a, (16 - c));
-+ a_c = _mm_srli_epi16(a_c, (16 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ //to keep only the low "c" bits of a we shift left and then logical shift back right, leaving (32-c) zeros in the high bits of a
-+ int32x4_t b_shift;
-+ int32x4_t a_c;
-+ b_shift = vshlq_n_s32( b, c);
-+ a_c = vshlq_n_s32( a, (32 - c));
-+ a_c = _mm_srli_epi32(a_c, (32 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ //to keep only the low "c" bits of a we shift left and then logical shift back right, leaving (64-c) zeros in the high bits of a
-+ int64x2_t b_shift;
-+ int64x2_t a_c;
-+ b_shift = vshlq_n_s64( b, c);
-+ a_c = vshlq_n_s64( a, (64 - c));
-+ a_c = _mm_srli_epi64(a_c, (64 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+#define vsliq_n_u8 vsliq_n_s8
-+
-+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+#define vsliq_n_u16 vsliq_n_s16
-+
-+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+#define vsliq_n_u32 vsliq_n_s32
-+
-+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+#define vsliq_n_u64 vsliq_n_s64
-+
-+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+#define vsliq_n_p8 vsliq_n_u8
-+
-+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+#define vsliq_n_p16 vsliq_n_u16
-+
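/* An illustrative scalar model of the vsli_n semantics described above; this is only a
 * sketch with a hypothetical helper name, not part of the emulation itself:
 * the low "c" bits come from "a", all remaining bits come from "b << c". */
static inline uint16_t vsli16_scalar_sketch(uint16_t a, uint16_t b, int c) /* 0 <= c <= 15 */
{
    uint16_t low_mask = (uint16_t)((1u << c) - 1u);          /* c ones in the low bits          */
    return (uint16_t)(((uint32_t)b << c) | (a & low_mask));  /* shifted b, low c bits of a kept */
}
/* e.g. vsli16_scalar_sketch(0xFFFF, 0x0001, 4) == 0x001F, matching VSLI.16 with #4. */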
-+// ***********************************************************************************************
-+// ****************** Loads and stores of a single vector ***************************************
-+// ***********************************************************************************************
-+//Performs loads and stores of a single vector of some type.
-+//******************************* Loads ********************************************************
-+// ***********************************************************************************************
-+//We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
-+//On SSE3-capable systems, __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
-+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
-+//If ptr is aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead (see the usage sketch after the macro below).
-+#define LOAD_SI128(ptr) \
-+ ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
-+
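/* A minimal usage sketch of the LOAD_SI128 dispatch above; the function and buffer names
 * are hypothetical and assume the definitions from this header are in scope. */
static inline uint8x16_t load_two_windows_sketch(const uint8_t *bytes /* at least 17 readable bytes */)
{
    uint8x16_t head = vld1q_u8(bytes);     /* the macro picks _mm_load_si128 or _mm_loadu_si128 at run time */
    uint8x16_t tail = vld1q_u8(bytes + 1); /* an odd offset always takes the _mm_loadu_si128 branch         */
    return _mm_or_si128(head, tail);       /* any further use of the two vectors                            */
}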
-+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_u8 LOAD_SI128
-+
-+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_u16 LOAD_SI128
-+
-+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+#define vld1q_u32 LOAD_SI128
-+
-+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld1q_u64 LOAD_SI128
-+
-+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_s8 LOAD_SI128
-+
-+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_s16 LOAD_SI128
-+
-+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+#define vld1q_s32 LOAD_SI128
-+
-+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld1q_s64 LOAD_SI128
-+
-+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
-+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
-+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-+__m128 f2;
-+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
-+}*/
-+
-+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
-+{
-+ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
-+ return _mm_load_ps(ptr);
-+ else
-+ return _mm_loadu_ps(ptr);
-+}
-+
-+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_p8 LOAD_SI128
-+
-+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_p16 LOAD_SI128
-+
-+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
-+
-+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_u16 vld1_u8
-+
-+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-+#define vld1_u32 vld1_u8
-+
-+
-+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_u64 vld1_u8
-+
-+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_s8 vld1_u8
-+
-+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_s16 vld1_u16
-+
-+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-+#define vld1_s32 vld1_u32
-+
-+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_s64 vld1_u64
-+
-+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-+
-+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = *(ptr);
-+ res.m64_f32[1] = *(ptr + 1);
-+ return res;
-+}
-+
-+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_p8 vld1_u8
-+
-+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_p16 vld1_u16
-+
-+//***********************************************************************************************************
-+//******* Lane load functions - insert the data at vector's given position (lane) *************************
-+//***********************************************************************************************************
-+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-+
-+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
-+
-+
-+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-+
-+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ //we need to handle the case where ptr is not 16-byte aligned
-+ __m128 p;
-+ p = _mm_set1_ps(*(ptr));
-+ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
-+}
-+
-+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
-+
-+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8x8_t res;
-+ res = vec;
-+ res.m64_u8[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16x4_t res;
-+ res = vec;
-+ res.m64_u16[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32x2_t res;
-+ res = vec;
-+ res.m64_u32[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = *(ptr);
-+ return res;
-+}
-+
-+
-+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
-+
-+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
-+
-+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
-+
-+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32x2_t res;
-+ res = vec;
-+ res.m64_f32[lane] = *(ptr);
-+ return res;
-+}
-+
-+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
-+
-+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1_lane_p8 vld1_lane_u8
-+
-+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1_lane_p16 vld1_lane_s16
-+
-+// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
-+// ******************************************************************************************************************
-+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
-+
-+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
-+
-+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
-+
-+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
-+ return LOAD_SI128(val);
-+}
-+
-+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
-+
-+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
-+
-+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
-+
-+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
-+
-+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+//current IA SIMD doesn't support float16, need to go to 32 bits
-+
-+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
-+
-+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
-+
-+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
-+
-+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint8x8_t res;
-+ int i;
-+ for(i = 0; i<8; i++) {
-+ res.m64_u8[i] = *(ptr);
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint16x4_t res;
-+ int i;
-+ for(i = 0; i<4; i++) {
-+ res.m64_u16[i] = *(ptr);
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = *(ptr);
-+ res.m64_u32[1] = *(ptr);
-+ return res;
-+}
-+
-+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = *(ptr);
-+ return res;
-+}
-+
-+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
-+
-+
-+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
-+
-+
-+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
-+
-+
-+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
-+
-+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = *(ptr);
-+ res.m64_f32[1] = res.m64_f32[0];
-+ return res; // use last 64bits only
-+}
-+
-+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1_dup_p8 vld1_dup_u8
-+
-+
-+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1_dup_p16 vld1_dup_u16
-+
-+
-+//*************************************************************************************
-+//********************************* Store **********************************************
-+//*************************************************************************************
-+// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
-+//here we assume a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro (see also the sketch after it).
-+#define STORE_SI128(ptr, val) \
-+ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
-+
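/* A hypothetical sketch contrasting STORE_SI128 (cacheable, alignment-checked) with the
 * non-temporal _mm_stream_si128 mentioned above; the names are made up for illustration. */
static inline void store_sketch(uint8_t *aligned_dst, uint8_t *any_dst, uint8x16_t v)
{
    _mm_stream_si128((__m128i*)aligned_dst, v); /* bypasses the cache; aligned_dst must be 16-byte aligned */
    vst1q_u8(any_dst, v);                       /* STORE_SI128 picks _mm_store_si128 or _mm_storeu_si128   */
}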
-+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_u8 STORE_SI128
-+
-+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_u16 STORE_SI128
-+
-+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-+#define vst1q_u32 STORE_SI128
-+
-+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-+#define vst1q_u64 STORE_SI128
-+
-+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_s8 STORE_SI128
-+
-+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_s16 STORE_SI128
-+
-+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-+#define vst1q_s32 STORE_SI128
-+
-+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-+#define vst1q_s64 STORE_SI128
-+
-+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
-+{
-+ if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
-+ _mm_store_ps (ptr, val);
-+ else
-+ _mm_storeu_ps (ptr, val);
-+}
-+
-+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_p8 vst1q_u8
-+
-+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_p16 vst1q_u16
-+
-+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
-+{
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ *(ptr + i) = ((uint8_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
-+{
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ *(ptr + i) = ((uint16_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
-+{
-+ int i;
-+ for (i = 0; i<2; i++) {
-+ *(ptr + i) = ((uint32_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
-+{
-+ *(ptr) = *((uint64_t*)&val);
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
-+
-+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
-+
-+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
-+
-+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
-+
-+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
-+{
-+ *(ptr) = val.m64_f32[0];
-+ *(ptr + 1) = val.m64_f32[1];
-+ return;
-+}
-+
-+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-+#define vst1_p8 vst1_u8
-+
-+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
-+#define vst1_p16 vst1_u16
-+
-+//***********Store a lane of a vector into memory (extract given lane) *********************
-+//******************************************************************************************
-+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-+
-+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-+
-+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-+
-+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-+
-+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-+
-+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-+
-+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-+
-+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-+
-+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
-+{
-+ int32_t ilane;
-+ ilane = _MM_EXTRACT_PS(val,lane);
-+ *(ptr) = *((float*)&ilane);
-+}
-+
-+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_p8 vst1q_lane_u8
-+
-+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_p16 vst1q_lane_s16
-+
-+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val.m64_u8[lane];
-+}
-+
-+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val.m64_u16[lane];
-+}
-+
-+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val.m64_u32[lane];
-+}
-+
-+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
-+{
-+ *(ptr) = val.m64_u64[0];
-+}
-+
-+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
-+
-+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
-+
-+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
-+
-+
-+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
-+
-+
-+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val.m64_f32[lane];
-+}
-+
-+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1_lane_p8 vst1_lane_u8
-+
-+void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1_lane_p16 vst1_lane_s16
-+
-+//***********************************************************************************************
-+//**************** Loads and stores of an N-element structure **********************************
-+//***********************************************************************************************
-+//These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file.
-+//We assume ptr is NOT aligned in the general case; for more details see "Loads and stores of a single vector" above. Usage sketches follow the 2- and 3-element load subsections below.
-+//****************** 2 elements load *********************************************
-+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
-+{
-+ uint8x16x2_t v;
-+ v.val[0] = vld1q_u8(ptr);
-+ v.val[1] = vld1q_u8((ptr + 16));
-+ v = vuzpq_s8(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = vld1q_u16( ptr);
-+ v.val[1] = vld1q_u16( (ptr + 8));
-+ v = vuzpq_s16(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = vld1q_u32 ( ptr);
-+ v.val[1] = vld1q_u32 ( (ptr + 4));
-+ v = vuzpq_s32(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
-+#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
-+
-+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
-+
-+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
-+
-+
-+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-+{
-+ float32x4x2_t v;
-+ v.val[0] = vld1q_f32 (ptr);
-+ v.val[1] = vld1q_f32 ((ptr + 4));
-+ v = vuzpq_f32(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+#define vld2q_p8 vld2q_u8
-+
-+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+#define vld2q_p16 vld2q_u16
-+
-+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
-+{
-+ uint8x8x2_t v;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
-+ __m128i ld128;
-+ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
-+ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
-+ return v;
-+}
-+
-+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint16x4x2_t v;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ __m128i ld128;
-+ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
-+ vst1q_u16((v.val), ld128);
-+ return v;
-+}
-+
-+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint32x2x2_t v;
-+ __m128i ld128;
-+ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ vst1q_u32((v.val), ld128);
-+ return v;
-+}
-+
-+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
-+{
-+ uint64x1x2_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ return v;
-+}
-+
-+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
-+
-+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
-+
-+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
-+
-+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
-+
-+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
-+
-+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
-+{
-+ float32x2x2_t v;
-+ v.val[0].m64_f32[0] = *(ptr);
-+ v.val[0].m64_f32[1] = *(ptr + 2);
-+ v.val[1].m64_f32[0] = *(ptr + 1);
-+ v.val[1].m64_f32[1] = *(ptr + 3);
-+ return v;
-+}
-+
-+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+#define vld2_p8 vld2_u8
-+
-+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+#define vld2_p16 vld2_u16
-+
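/* An illustrative use of the 2-element structure loads above: de-interleaving 16 {even, odd}
 * byte pairs into two separate vectors; the function and variable names are hypothetical. */
static inline void split_pairs_sketch(const uint8_t interleaved[32], uint8_t evens[16], uint8_t odds[16])
{
    uint8x16x2_t d = vld2q_u8(interleaved); /* val[0] = elements 0,2,4,..., val[1] = elements 1,3,5,... */
    vst1q_u8(evens, d.val[0]);
    vst1q_u8(odds,  d.val[1]);
}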
-+//******************** Triplets ***************************************
-+//*********************************************************************
-+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
-+ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
-+ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
-+ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
-+ uint8x16x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
-+
-+ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
-+ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
-+ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
-+
-+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
-+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
-+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
-+
-+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
-+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
-+ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
-+ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
-+ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
-+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
-+
-+ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
-+ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
-+ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
-+ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
-+ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
-+ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
-+ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
-+ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
-+
-+ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
-+ v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
-+ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
-+ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
-+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
-+ return v;
-+}
-+
-+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
-+{
-+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
-+ uint16x8x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
-+ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
-+
-+ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
-+ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
-+ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
-+
-+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
-+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
-+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
-+
-+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
-+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
-+ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
-+ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
-+ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
-+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
-+
-+ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
-+ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
-+ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
-+ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
-+ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
-+ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
-+ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
-+
-+ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
-+ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
-+ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
-+ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
-+ return v;
-+}
-+
-+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ uint32x4x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
-+ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
-+ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
-+
-+ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
-+ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
-+ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
-+
-+ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
-+ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
-+ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
-+ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
-+ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
-+ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
-+ return v;
-+}
-+
-+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
-+
-+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
-+
-+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
-+
-+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ float32x4x3_t v;
-+ __m128 tmp0, tmp1,tmp2, tmp3;
-+ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
-+ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
-+ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
-+
-+ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
-+ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
-+ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
-+ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
-+
-+ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
-+ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
-+ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
-+ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
-+ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
-+ return v;
-+}
-+
-+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+#define vld3q_p8 vld3q_u8
-+
-+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+#define vld3q_p16 vld3q_u16
-+
-+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
-+{
-+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
-+ uint8x8x3_t v;
-+ __m128i val0, val1, val2, tmp0, tmp1;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
-+ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
-+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
-+
-+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
-+ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
-+ val0 = _mm_slli_si128(tmp0,10);
-+ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
-+ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
-+ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
-+ _M64(v.val[0], val0);
-+ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
-+ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
-+ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
-+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
-+ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
-+ _M64(v.val[1], val1);
-+
-+ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
-+ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
-+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
-+ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ uint16x4x3_t v;
-+ __m128i val0, val1, val2, tmp0, tmp1;
-+ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
-+ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
-+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
-+
-+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
-+ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
-+ val0 = _mm_slli_si128(tmp0,10);
-+ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
-+ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
-+ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
-+ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
-+ _M64(v.val[0], val0);
-+
-+ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
-+ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
-+ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
-+ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
-+ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
-+ _M64(v.val[1], val1);
-+
-+ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
-+ tmp1 = _mm_srli_si128(tmp1,4);
-+ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
-+ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
-+{
-+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
-+ uint32x2x3_t v;
-+ __m128i val0, val1, val2;
-+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
-+ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
-+
-+ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
-+ _M64(v.val[0], val0);
-+ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
-+ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
-+ _M64(v.val[1], val1);
-+ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
-+{
-+ uint64x1x3_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ return v;
-+}
-+
-+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
-+
-+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
-+
-+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
-+
-+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
-+
-+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
-+{
-+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
-+ float32x2x3_t v;
-+ v.val[0].m64_f32[0] = *(ptr);
-+ v.val[0].m64_f32[1] = *(ptr + 3);
-+
-+ v.val[1].m64_f32[0] = *(ptr + 1);
-+ v.val[1].m64_f32[1] = *(ptr + 4);
-+
-+ v.val[2].m64_f32[0] = *(ptr + 2);
-+ v.val[2].m64_f32[1] = *(ptr + 5);
-+ return v;
-+}
-+
-+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+#define vld3_p8 vld3_u8
-+
-+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+#define vld3_p16 vld3_u16
-+
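/* An illustrative use of the 3-element structure loads above: splitting 16 packed RGB pixels
 * into separate R, G and B planes; the function and variable names are hypothetical. */
static inline void deinterleave_rgb_sketch(const uint8_t rgb[48], uint8_t r[16], uint8_t g[16], uint8_t b[16])
{
    uint8x16x3_t px = vld3q_u8(rgb); /* val[0] = R0..R15, val[1] = G0..G15, val[2] = B0..B15 */
    vst1q_u8(r, px.val[0]);
    vst1q_u8(g, px.val[1]);
    vst1q_u8(b, px.val[2]);
}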
-+//*************** Quadruples load ********************************
-+//*****************************************************************
-+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
-+{
-+ uint8x16x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+
-+ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
-+ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
-+ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
-+ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
-+
-+ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
-+ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
-+ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
-+ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
-+
-+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
-+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
-+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
-+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
-+
-+ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
-+ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
-+ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
-+ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
-+
-+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
-+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
-+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
-+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
-+ return v;
-+}
-+
-+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
-+ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
-+ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
-+ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
-+ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
-+ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
-+ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
-+ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
-+ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
-+ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
-+ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
-+ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
-+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
-+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
-+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
-+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
-+ return v;
-+}
-+
-+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
-+{
-+ uint32x4x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+ v.val[0] = vld1q_u32 (ptr);
-+ v.val[1] = vld1q_u32 ((ptr + 4));
-+ v.val[2] = vld1q_u32 ((ptr + 8));
-+ v.val[3] = vld1q_u32 ((ptr + 12));
-+ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
-+ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
-+ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
-+ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
-+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
-+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
-+ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
-+ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
-+ return v;
-+}
-+
-+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
-+
-+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
-+
-+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
-+
-+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
-+{
-+ float32x4x4_t v;
-+ __m128 tmp3, tmp2, tmp1, tmp0;
-+
-+ v.val[0] = vld1q_f32 ((float*) ptr);
-+ v.val[1] = vld1q_f32 ((float*) (ptr + 4));
-+ v.val[2] = vld1q_f32 ((float*) (ptr + 8));
-+ v.val[3] = vld1q_f32 ((float*) (ptr + 12));
-+ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
-+ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
-+ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
-+ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
-+ v.val[0] = _mm_movelh_ps(tmp0, tmp2);
-+ v.val[1] = _mm_movehl_ps(tmp2, tmp0);
-+ v.val[2] = _mm_movelh_ps(tmp1, tmp3);
-+ v.val[3] = _mm_movehl_ps(tmp3, tmp1);
-+ return v;
-+}
-+
-+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+#define vld4q_p8 vld4q_u8
-+
-+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+#define vld4q_p16 vld4q_s16
-+
-+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
-+{
-+ uint8x8x4_t v;
-+ __m128i sh0, sh1;
-+ __m128i val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
-+
-+ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
-+ val2 = vld1q_u8(( ptr + 16)); //load third and fourth 64-bits in val[2], val[3]
-+
-+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
-+ vst1q_u8(&v.val[0], val0 );
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
-+ vst1q_u8(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
-+{
-+ uint16x4x4_t v;
-+ __m128i sh0, sh1;
-+ __m128i val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
-+ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
-+ val2 = vld1q_u16 ( (ptr + 8)); //load third and fourth 64-bits in val[2], val[3]
-+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
-+ vst1q_u16(&v.val[0], val0 );
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
-+ vst1q_u16(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
-+{
-+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-+ uint32x2x4_t v;
-+ __m128i val0, val01, val2;
-+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
-+ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
-+ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
-+ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
-+ vst1q_u32(&v.val[0], val01);
-+ vst1q_u32(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
-+{
-+ uint64x1x4_t v;
-+ v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit value into val[0]
-+ v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit value into val[1]
-+ v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit value into val[2]
-+ v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit value into val[3]
-+ return v;
-+}
-+
-+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
-+
-+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
-+
-+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
-+
-+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
-+
-+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
-+{
-+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-+ float32x2x4_t res;
-+ res.val[0].m64_f32[0] = *(ptr);
-+ res.val[0].m64_f32[1] = *(ptr + 4);
-+ res.val[1].m64_f32[0] = *(ptr + 1);
-+ res.val[1].m64_f32[1] = *(ptr + 5);
-+ res.val[2].m64_f32[0] = *(ptr + 2);
-+ res.val[2].m64_f32[1] = *(ptr + 6);
-+ res.val[3].m64_f32[0] = *(ptr + 3);
-+ res.val[3].m64_f32[1] = *(ptr + 7);
-+ return res;
-+}
-+
-+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+#define vld4_p8 vld4_u8
-+
-+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+#define vld4_p16 vld4_u16
-+
-+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
-+//*******************************************************************************************************************
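-+//example with assumed input values: if ptr[0] == 7 and ptr[1] == 9, then vld2_dup_u8(ptr)
-+//returns val[0] = {7,7,7,7,7,7,7,7} and val[1] = {9,9,9,9,9,9,9,9}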
-+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
-+{
-+ uint8x8x2_t v;
-+ __m128i val0, val1;
-+ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ vst1q_u8(v.val, val0);
-+ return v;
-+}
-+
-+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
-+{
-+ uint16x4x2_t v;
-+ __m128i val0, val1;
-+ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
-+ _M64(v.val[0], val0);
-+ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
-+ _M64(v.val[1], val1);
-+ return v;
-+}
-+
-+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
-+{
-+ uint32x2x2_t v;
-+ __m128i val0;
-+ val0 = LOAD_SI128(ptr); //0,1,x,x
-+ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
-+ vst1q_u32(v.val, val0);
-+ return v;
-+}
-+
-+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_dup_u64 vld2_u64
-+
-+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
-+
-+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
-+
-+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
-+
-+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
-+
-+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
-+{
-+ float32x2x2_t v;
-+ v.val[0].m64_f32[0] = *(ptr); //0,0
-+ v.val[0].m64_f32[1] = *(ptr); //0,0
-+ v.val[1].m64_f32[0] = *(ptr + 1); //1,1
-+ v.val[1].m64_f32[1] = *(ptr + 1); //1,1
-+ return v;
-+}
-+
-+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+#define vld2_dup_p8 vld2_dup_u8
-+
-+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+#define vld2_dup_p16 vld2_dup_s16
-+
-+//************* Duplicate (or propagate) triplets: *******************
-+//********************************************************************
-+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
-+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint8x8x3_t v;
-+ __m128i val0, val1, val2;
-+ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
-+ vst1q_u8(v.val, val0);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint16x4x3_t v;
-+ __m128i val0, val1, val2;
-+ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
-+ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
-+ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint32x2x3_t v;
-+ __m128i val0, val1, val2;
-+ val2 = LOAD_SI128(ptr); //0,1,2,x
-+ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
-+ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
-+ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
-+{
-+ uint64x1x3_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ return v;
-+}
-+
-+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
-+
-+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
-+
-+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
-+
-+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
-+
-+
-+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+{
-+ float32x2x3_t v;
-+ int i;
-+ for (i = 0; i<3; i++) {
-+ v.val[i].m64_f32[0] = *(ptr + i);
-+ v.val[i].m64_f32[1] = *(ptr + i);
-+ }
-+ return v;
-+}
-+
-+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_p8 vld3_dup_u8
-+
-+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_p16 vld3_dup_s16
-+
-+
-+//************* Duplicate (or propagate) quadruples: *******************
-+//***********************************************************************
-+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
-+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint8x8x4_t v;
-+ __m128i val0, val1, val2;
-+ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
-+ vst1q_u8(&v.val[0], val0);
-+ vst1q_u8(&v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint16x4x4_t v;
-+ __m128i val0, val1, val2, val3;
-+ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
-+ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
-+ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
-+ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ _M64(v.val[3], val3);
-+ return v;
-+}
-+
-+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint32x2x4_t v;
-+ __m128i val0, val1, val2, val3;
-+ val3 = LOAD_SI128(ptr); //0,1,2,3
-+ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
-+ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
-+ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
-+ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ _M64(v.val[3], val3);
-+ return v;
-+}
-+
-+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
-+{
-+ uint64x1x4_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ v.val[3].m64_u64[0] = *(ptr + 3);
-+ return v;
-+}
-+
-+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
-+
-+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
-+
-+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
-+
-+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
-+
-+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ float32x2x4_t v;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ v.val[i].m64_f32[0] = *(ptr + i);
-+ v.val[i].m64_f32[1] = *(ptr + i);
-+ }
-+ return v;
-+}
-+
-+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_p8 vld4_dup_u8
-+
-+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_p16 vld4_dup_u16
-+
-+
-+//**********************************************************************************
-+//*******************Lane loads for an N-element structures ***********************
-+//**********************************************************************************
-+//********************** Lane pairs ************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, resulting in a "formal parameter with __declspec(align('16')) won't be aligned" error
-+//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined as macros
-+
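-+//illustrative usage (the variable names below are assumed for the example only):
-+// uint16x8x2_t acc; //...previously initialized pair of vectors...
-+// uint16_t pair[2] = {1, 2};
-+// acc = vld2q_lane_u16(pair, acc, 3); //the macro expands to vld2q_lane_u16_ptr(pair, &acc, 3)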
-+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
-+{
-+ int16x8x2_t v;
-+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
-+
-+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
-+{
-+ int32x4x2_t v;
-+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
-+
-+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
-+{
-+ float32x4x2_t v;
-+ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+#define vld2q_lane_p16 vld2q_lane_u16
-+
-+//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
-+{
-+ uint8x8x2_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
-+{
-+ uint16x4x2_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
-+{
-+ uint32x2x2_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
-+
-+//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
-+
-+//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
-+
-+//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
-+
-+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
-+{
-+ float32x2x2_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
-+
-+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_p8 vld2_lane_u8
-+
-+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_p16 vld2_lane_u16
-+
-+//*********** Lane triplets **********************
-+//*************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ uint16x8x3_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ uint32x4x3_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ int16x8x3_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
-+
-+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ int32x4x3_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
-+
-+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
-+
-+
-+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ float32x4x3_t v;
-+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
-+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
-+
-+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+#define vld3q_lane_p16 vld3q_lane_u16
-+
-+//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ uint8x8x3_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ uint16x4x3_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ //need to merge into 128 bit anyway
-+ uint32x2x3_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
-+
-+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
-+
-+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
-+
-+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
-+
-+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ float32x2x3_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_p8 vld3_lane_u8
-+
-+//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_p16 vld3_lane_u16
-+
-+//******************* Lane Quadruples load ***************************
-+//*********************************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
-+{
-+ uint16x8x4_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
-+{
-+ uint32x4x4_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
-+
-+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
-+
-+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
-+{
-+ float32x4x4_t v;
-+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
-+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
-+ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
-+
-+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_p16 vld4q_lane_u16
-+
-+//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
-+{
-+ uint8x8x4_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
-+{
-+ uint16x4x4_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
-+{
-+ uint32x2x4_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
-+
-+//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
-+#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
-+
-+//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
-+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
-+
-+//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
-+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
-+
-+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
-+{
-+ //serial solution may be faster
-+ float32x2x4_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
-+#define vld4_lane_p8 vld4_lane_u8
-+
-+//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
-+#define vld4_lane_p16 vld4_lane_u16
-+
-+//******************* Store duplets *********************************************
-+//********************************************************************************
-+//here we assume the ptr is 16-byte aligned. If not, we need to use _mm_storeu_si128 as shown in the vst1q_u8 function
-+//If necessary, modify all store functions accordingly. See the comments on the "Store single" functions
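-+//a minimal sketch of an unaligned variant (the name vst2q_u8_unaligned is hypothetical, not part of this header):
-+// _NEON2SSE_INLINE void vst2q_u8_unaligned(uint8_t * ptr, uint8x16x2_t* val)
-+// {
-+//     _mm_storeu_si128((__m128i*)ptr, _mm_unpacklo_epi8(val->val[0], val->val[1]));
-+//     _mm_storeu_si128((__m128i*)(ptr + 16), _mm_unpackhi_epi8(val->val[0], val->val[1]));
-+// }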
-+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
-+{
-+ uint8x16x2_t v;
-+ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
-+ vst1q_u8 (ptr, v.val[0]);
-+ vst1q_u8 ((ptr + 16), v.val[1]);
-+}
-+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
-+
-+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
-+ vst1q_u16 (ptr, v.val[0]);
-+ vst1q_u16 ((ptr + 8), v.val[1]);
-+}
-+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
-+
-+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
-+ vst1q_u32 (ptr, v.val[0]);
-+ vst1q_u32 ((ptr + 4), v.val[1]);
-+}
-+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
-+
-+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
-+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
-+
-+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
-+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
-+
-+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
-+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
-+#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
-+
-+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
-+{
-+ float32x4x2_t v;
-+ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
-+ vst1q_f32 (ptr, v.val[0]);
-+ vst1q_f32 ((ptr + 4), v.val[1]);
-+}
-+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
-+
-+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
-+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
-+#define vst2q_p8 vst2q_u8
-+
-+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
-+#define vst2q_p16 vst2q_u16
-+
-+//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u8 (ptr, v0);
-+}
-+#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
-+
-+//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u16 (ptr, v0);
-+}
-+#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
-+
-+//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u32 (ptr, v0);
-+}
-+#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
-+
-+
-+//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
-+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
-+_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+}
-+#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
-+
-+//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
-+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
-+
-+//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
-+
-+//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
-+
-+//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
-+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
-+
-+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
-+{
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[0].m64_f32[1];
-+ *(ptr + 3) = val->val[1].m64_f32[1];
-+}
-+#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
-+
-+//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+#define vst2_p8 vst2_u8
-+
-+//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+#define vst2_p16 vst2_u16
-+
-+//******************** Triplets store *****************************************
-+//******************************************************************************
-+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
-+{
-+ uint8x16x3_t v;
-+ __m128i v0,v1,v2, cff, bldmask;
-+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
-+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
-+
-+ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
-+ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
-+ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
-+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
-+ cff = _mm_cmpeq_epi8(v0, v0); //all ff
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
-+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
-+ vst1q_u8(ptr, v.val[0]);
-+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
-+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
-+ vst1q_u8((ptr + 16), v.val[1]);
-+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
-+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
-+ vst1q_u8((ptr + 32), v.val[2]);
-+}
-+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
-+
-+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
-+{
-+ uint16x8x3_t v;
-+ __m128i v0,v1,v2, cff, bldmask;
-+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
-+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
-+
-+ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
-+ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
-+ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
-+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
-+ cff = _mm_cmpeq_epi16(v0, v0); //all ff
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
-+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
-+ vst1q_u16(ptr, v.val[0]);
-+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
-+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
-+ vst1q_u16((ptr + 8), v.val[1]);
-+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
-+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
-+ vst1q_u16((ptr + 16), v.val[2]);
-+}
-+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
-+
-+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
-+ uint32x4x3_t v;
-+ __m128i tmp0, tmp1,tmp2;
-+ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
-+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
-+ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
-+ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
-+ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
-+ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
-+ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
-+ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
-+
-+ vst1q_u32(ptr, v.val[0]);
-+ vst1q_u32((ptr + 4), v.val[1]);
-+ vst1q_u32((ptr + 8), v.val[2]);
-+}
-+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
-+
-+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
-+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
-+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
-+
-+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
-+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
-+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
-+
-+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
-+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
-+#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
-+
-+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
-+{
-+ float32x4x3_t v;
-+ __m128 tmp0, tmp1,tmp2;
-+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
-+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
-+ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
-+ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
-+ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
-+ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
-+ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
-+ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
-+
-+ vst1q_f32( ptr, v.val[0]);
-+ vst1q_f32( (ptr + 4), v.val[1]);
-+ vst1q_f32( (ptr + 8), v.val[2]);
-+}
-+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
-+
-+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
-+#define vst3q_p8 vst3q_u8
-+
-+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
-+#define vst3q_p16 vst3q_u16
-+
-+//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
-+{
-+ __m128i tmp, sh0, sh1, val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
-+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
-+ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
-+ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
-+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
-+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
-+ val2 = _pM128i(val->val[2]);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
-+ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
-+ vst1q_u8(ptr, val0); //store as 128 bit structure
-+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
-+ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
-+ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
-+}
-+#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
-+
-+//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
-+{
-+ __m128i tmp, val0, val1, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
-+ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
-+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
-+ val2 = _pM128i(val->val[2]);
-+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
-+ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
-+ vst1q_u16(ptr, val0); //store as 128 bit structure
-+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
-+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
-+ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
-+ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
-+}
-+#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
-+
-+//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
-+{
-+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
-+ __m128i val0, val1;
-+ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
-+ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
-+ val1 = _mm_srli_si128(val0, 8); //4,5, x,x
-+ _M64((*(__m64_128*)(ptr + 4)), val1);
-+ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
-+ val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
-+ vst1q_u32(ptr, val0); //store as 128 bit structure
-+}
-+#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
-+
-+//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+ *(ptr + 2) = val->val[2].m64_u64[0];
-+}
-+#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
-+
-+//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0]
-+#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
-+
-+//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0]
-+#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
-+
-+//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
-+#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
-+
-+//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
-+#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
-+
-+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
-+{
-+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[2].m64_f32[0];
-+ *(ptr + 3) = val->val[0].m64_f32[1];
-+ *(ptr + 4) = val->val[1].m64_f32[1];
-+ *(ptr + 5) = val->val[2].m64_f32[1];
-+}
-+#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
-+
-+//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
-+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
-+#define vst3_p8 vst3_u8
-+
-+//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
-+#define vst3_p16 vst3_s16
-+
-+//*************** Quadruples store ********************************
-+//*********************************************************************
-+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
-+{
-+ __m128i tmp1, tmp2, res;
-+ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
-+ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
-+ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
-+ vst1q_u8(ptr, res);
-+ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
-+ vst1q_u8((ptr + 16), res);
-+ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
-+ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
-+ res = _mm_unpacklo_epi16(tmp1, tmp2); //
-+ vst1q_u8((ptr + 32), res);
-+ res = _mm_unpackhi_epi16(tmp1, tmp2); //
-+ vst1q_u8((ptr + 48), res);
-+}
-+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
-+
-+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
-+ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
-+ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
-+ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
-+ vst1q_u16(ptr, v.val[0]);
-+ vst1q_u16((ptr + 8), v.val[1]);
-+ vst1q_u16((ptr + 16),v.val[2]);
-+ vst1q_u16((ptr + 24), v.val[3]);
-+}
-+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
-+
-+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
-+ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
-+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
-+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
-+ vst1q_u32(ptr, v.val[0]);
-+ vst1q_u32((ptr + 4), v.val[1]);
-+ vst1q_u32((ptr + 8), v.val[2]);
-+ vst1q_u32((ptr + 12), v.val[3]);
-+}
-+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
-+
-+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
-+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
-+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
-+
-+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
-+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
-+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
-+
-+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
-+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
-+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
-+
-+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
-+{
-+ __m128 tmp3, tmp2, tmp1, tmp0;
-+ float32x4x4_t v;
-+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
-+ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
-+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
-+ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
-+ v.val[0] = _mm_movelh_ps(tmp0, tmp2);
-+ v.val[1] = _mm_movehl_ps(tmp2, tmp0);
-+ v.val[2] = _mm_movelh_ps(tmp1, tmp3);
-+ v.val[3] = _mm_movehl_ps(tmp3, tmp1);
-+ vst1q_f32(ptr, v.val[0]);
-+ vst1q_f32((ptr + 4), v.val[1]);
-+ vst1q_f32((ptr + 8), v.val[2]);
-+ vst1q_f32((ptr + 12), v.val[3]);
-+}
-+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
-+
-+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
-+#define vst4q_p8 vst4q_u8
-+
-+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
-+#define vst4q_p16 vst4q_s16
-+
-+//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
-+{
-+ __m128i sh0, sh1, val0, val2;
-+ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
-+ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
-+ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
-+ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
-+ vst1q_u8(ptr, val0);
-+ vst1q_u8((ptr + 16), val2);
-+}
-+#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
-+
-+//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
-+{
-+ __m128i sh0, sh1, val0, val2;
-+ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
-+ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
-+ vst1q_u16(ptr, val0); //store as 128 bit structure
-+ vst1q_u16((ptr + 8), val2);
-+}
-+#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
-+
-+//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
-+{
-+ //0,4, 1,5, 2,6, 3,7
-+ __m128i sh0, sh1, val0, val1;
-+ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
-+ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
-+ val0 = _mm_unpacklo_epi64(sh0,sh1); //
-+ val1 = _mm_unpackhi_epi64(sh0,sh1); //
-+ vst1q_u32(ptr, val0); //store as 128 bit structure
-+ vst1q_u32((ptr + 4), val1);
-+}
-+#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
-+
-+//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+ *(ptr + 2) = val->val[2].m64_u64[0];
-+ *(ptr + 3) = val->val[3].m64_u64[0];
-+}
-+#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
-+
-+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
-+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
-+
-+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
-+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
-+
-+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
-+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
-+
-+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
-+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
-+
-+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
-+{
-+ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[2].m64_f32[0];
-+ *(ptr + 3) = val->val[3].m64_f32[0];
-+ *(ptr + 4) = val->val[0].m64_f32[1];
-+ *(ptr + 5) = val->val[1].m64_f32[1];
-+ *(ptr + 6) = val->val[2].m64_f32[1];
-+ *(ptr + 7) = val->val[3].m64_f32[1];
-+}
-+#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
-+
-+//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
-+#define vst4_p8 vst4_u8
-+
-+//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
-+#define vst4_p16 vst4_u16
-+
-+//*********** Store a lane of a vector into memory (extract the given lane) for a pair of vectors *********************
-+//********************************************************************************************************************
-+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
-+{
-+ vst1q_lane_s16(ptr, val->val[0], lane);
-+ vst1q_lane_s16((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_u32(ptr, val->val[0], lane);
-+ vst1q_lane_u32((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
-+#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
-+
-+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2q_lane_p16 vst2q_lane_s16
-+
-+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+}
-+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+}
-+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+}
-+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
-+
-+//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
-+#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
-+#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+}
-+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
-+
-+//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+#define vst2_lane_p8 vst2_lane_u8
-+
-+//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+#define vst2_lane_p16 vst2_lane_u16
-+
-+//************************* Triple lanes stores *******************************************************
-+//*******************************************************************************************************
-+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
-+{
-+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
-+ vst1q_lane_u16((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
-+ vst1q_lane_u32((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
-+
-+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
-+
-+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+ vst1q_lane_f32((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3q_lane_p16 vst3q_lane_s16
-+
-+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+ *(ptr + 2) = val->val[2].m64_u8[lane];
-+}
-+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+ *(ptr + 2) = val->val[2].m64_u16[lane];
-+}
-+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+ *(ptr + 2) = val->val[2].m64_u32[lane];
-+}
-+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
-+
-+//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
-+
-+//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
-+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
-+
-+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
-+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+ *(ptr + 2) = val->val[2].m64_f32[lane];
-+}
-+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3_lane_p8 vst3_lane_u8
-+
-+//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3_lane_p16 vst3_lane_s16
-+
-+//******************************** Quadruple lanes stores ***********************************************
-+//*******************************************************************************************************
-+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
-+{
-+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
-+ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
-+}
-+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
-+{
-+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
-+ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
-+}
-+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
-+
-+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
-+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
-+
-+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+ vst1q_lane_f32((ptr + 2), val->val[2], lane);
-+ vst1q_lane_f32((ptr + 3), val->val[3], lane);
-+}
-+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4q_lane_p16 vst4q_lane_u16
-+
-+//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+ *(ptr + 2) = val->val[2].m64_u8[lane];
-+ *(ptr + 3) = val->val[3].m64_u8[lane];
-+}
-+#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+ *(ptr + 2) = val->val[2].m64_u16[lane];
-+ *(ptr + 3) = val->val[3].m64_u16[lane];
-+}
-+#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+ *(ptr + 2) = val->val[2].m64_u32[lane];
-+ *(ptr + 3) = val->val[3].m64_u32[lane];
-+}
-+#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
-+
-+//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+ *(ptr + 2) = val->val[2].m64_f32[lane];
-+ *(ptr + 3) = val->val[3].m64_f32[lane];
-+}
-+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4_lane_p8 vst4_lane_u8
-+
-+//void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
-+#define vst4_lane_p16 vst4_lane_u16
-+
-+//**************************************************************************************************
-+//************************ Extract lanes from a vector ********************************************
-+//**************************************************************************************************
-+//These intrinsics extract a single lane (element) from a vector.
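-+//A minimal usage sketch (illustrative only; variable names and values are arbitrary):
-+//  uint8x8_t v = vdup_n_u8(42);
-+//  uint8_t b = vget_lane_u8(v, 3);     // b == 42; the lane index must be a compile-time constant
-+//  float32x4_t q = vdupq_n_f32(1.5f);
-+//  float32_t f = vgetq_lane_f32(q, 0); // f == 1.5f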
-+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
-+
-+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
-+#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
-+
-+
-+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
-+
-+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
-+#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
-+
-+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
-+#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
-+
-+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
-+
-+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+#define vget_lane_p8 vget_lane_u8
-+
-+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
-+#define vget_lane_p16 vget_lane_u16
-+
-+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
-+
-+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+#define vgetq_lane_u8 _MM_EXTRACT_EPI8
-+
-+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
-+#define vgetq_lane_u16 _MM_EXTRACT_EPI16
-+
-+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+#define vgetq_lane_u32 _MM_EXTRACT_EPI32
-+
-+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-+#define vgetq_lane_s8 vgetq_lane_u8
-+
-+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-+#define vgetq_lane_s16 vgetq_lane_u16
-+
-+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+#define vgetq_lane_s32 vgetq_lane_u32
-+
-+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+#define vgetq_lane_p8 vgetq_lane_u8
-+
-+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
-+#define vgetq_lane_p16 vgetq_lane_u16
-+
-+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ int32_t ilane;
-+ ilane = _MM_EXTRACT_PS(vec,lane);
-+ return *(float*)&ilane;
-+}
-+
-+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+#define vget_lane_s64(vec, lane) vec.m64_i64[0]
-+
-+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+#define vget_lane_u64(vec, lane) vec.m64_u64[0]
-+
-+
-+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+#define vgetq_lane_s64 (int64_t) vgetq_lane_u64
-+
-+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+#define vgetq_lane_u64 _MM_EXTRACT_EPI64
-+
-+// ***************** Set lanes within a vector ********************************************
-+// **************************************************************************************
-+//These intrinsics set a single lane (element) within a vector.
-+//Same functions as the vld1_lane_xx ones, but they take the value to be set directly.
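-+//An illustrative usage sketch (variable names and values are arbitrary):
-+//  int16x4_t v = vdup_n_s16(0);
-+//  v = vset_lane_s16(-7, v, 2);        // lanes become 0, 0, -7, 0
-+//  float32x4_t q = vdupq_n_f32(0.0f);
-+//  q = vsetq_lane_f32(3.5f, q, 1);     // lanes become 0.0f, 3.5f, 0.0f, 0.0f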
-+
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8_t val;
-+ val = value;
-+ return vld1_lane_u8(&val, vec, lane);
-+}
-+
-+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16_t val;
-+ val = value;
-+ return vld1_lane_u16(&val, vec, lane);
-+}
-+
-+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32_t val;
-+ val = value;
-+ return vld1_lane_u32(&val, vec, lane);
-+}
-+
-+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
-+{
-+ int8_t val;
-+ val = value;
-+ return vld1_lane_s8(&val, vec, lane);
-+}
-+
-+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
-+{
-+ int16_t val;
-+ val = value;
-+ return vld1_lane_s16(&val, vec, lane);
-+}
-+
-+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
-+{
-+ int32_t val;
-+ val = value;
-+ return vld1_lane_s32(&val, vec, lane);
-+}
-+
-+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+#define vset_lane_p8 vset_lane_u8
-+
-+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+#define vset_lane_p16 vset_lane_u16
-+
-+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32_t val;
-+ val = value;
-+ return vld1_lane_f32(&val, vec, lane);
-+}
-+
-+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
-+{
-+ uint8_t val;
-+ val = value;
-+ return vld1q_lane_u8(&val, vec, lane);
-+}
-+
-+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint16_t val;
-+ val = value;
-+ return vld1q_lane_u16(&val, vec, lane);
-+}
-+
-+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint32_t val;
-+ val = value;
-+ return vld1q_lane_u32(&val, vec, lane);
-+}
-+
-+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
-+{
-+ int8_t val;
-+ val = value;
-+ return vld1q_lane_s8(&val, vec, lane);
-+}
-+
-+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
-+{
-+ int16_t val;
-+ val = value;
-+ return vld1q_lane_s16(&val, vec, lane);
-+}
-+
-+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
-+{
-+ int32_t val;
-+ val = value;
-+ return vld1q_lane_s32(&val, vec, lane);
-+}
-+
-+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+#define vsetq_lane_p8 vsetq_lane_u8
-+
-+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+#define vsetq_lane_p16 vsetq_lane_u16
-+
-+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ float32_t val;
-+ val = value;
-+ return vld1q_lane_f32(&val, vec, lane);
-+}
-+
-+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
-+{
-+ int64_t val;
-+ val = value;
-+ return vld1_lane_s64(&val, vec, lane);
-+}
-+
-+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
-+{
-+ uint64_t val;
-+ val = value;
-+ return vld1_lane_u64(&val, vec, lane);
-+}
-+
-+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint64_t val;
-+ val = value;
-+ return vld1q_lane_s64(&val, vec, lane);
-+}
-+
-+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+#define vsetq_lane_u64 vsetq_lane_s64
-+
-+// *******************************************************************************
-+// **************** Initialize a vector from bit pattern ***************************
-+// *******************************************************************************
-+//These intrinsics create a vector from a literal bit pattern.
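-+//A minimal usage sketch (illustrative only; the literals below are arbitrary examples):
-+//  uint8x8_t v = vcreate_u8(0x0706050403020100ULL); // lanes 0..7 hold 0x00 .. 0x07 (lane 0 is the least significant byte)
-+//  int32x2_t w = vcreate_s32(0xffffffff00000001ULL); // lanes hold 1 and -1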
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s8(a) (*(__m64_128*)&(a))
-+
-+
-+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s16 vcreate_s8
-+
-+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s32 vcreate_s8
-+
-+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
-+//no IA32 SIMD available
-+
-+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_f32(a) (*(__m64_128*)&(a))
-+
-+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u8 vcreate_s8
-+
-+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u16 vcreate_s16
-+
-+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u32 vcreate_s32
-+
-+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u64 vcreate_s8
-+
-+
-+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_p8 vcreate_u8
-+
-+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_p16 vcreate_u16
-+
-+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s64 vcreate_u64
-+
-+//********************* Set all lanes to same value ********************************
-+//*********************************************************************************
-+//These intrinsics set all lanes to the same value.
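-+//A minimal usage sketch (illustrative only; values are arbitrary):
-+//  uint16x4_t a = vdup_n_u16(0xABCD); // all four lanes hold 0xABCD
-+//  float32x4_t b = vdupq_n_f32(2.0f); // all four lanes hold 2.0f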
-+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint8x8_t res;
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ res.m64_u8[i] = value;
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint16x4_t res;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ res.m64_u16[i] = value;
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = value;
-+ res.m64_u32[1] = value;
-+ return res;
-+}
-+
-+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int8x8_t res;
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ res.m64_i8[i] = value;
-+ }
-+ return res;
-+}
-+
-+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ res.m64_i16[i] = value;
-+ }
-+ return res;
-+}
-+
-+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t res;
-+ res.m64_i32[0] = value;
-+ res.m64_i32[1] = value;
-+ return res;
-+}
-+
-+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
-+#define vdup_n_p8 vdup_n_u8
-+
-+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
-+#define vdup_n_p16 vdup_n_s16
-+
-+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = value;
-+ res.m64_f32[1] = value;
-+ return res;
-+}
-+
-+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
-+
-+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
-+
-+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
-+
-+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_s8 _mm_set1_epi8
-+
-+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_s16 _mm_set1_epi16
-+
-+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_s32 _mm_set1_epi32
-+
-+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_p8 vdupq_n_u8
-+
-+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_p16 vdupq_n_u16
-+
-+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_f32 _mm_set1_ps
-+
-+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = value;
-+ return res;
-+}
-+
-+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = value;
-+ return res;
-+}
-+
-+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
-+{
-+ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
-+ return LOAD_SI128(value2);
-+}
-+
-+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
-+ return LOAD_SI128(val);
-+}
-+
-+//**** Set all lanes to same value ************************
-+//Same functions as above - just aliases.********************
-+//Probably they reflect the fact that the 128-bit function versions use the VMOV instruction **********
-+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
-+#define vmov_n_u8 vdup_n_s8
-+
-+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
-+#define vmov_n_u16 vdup_n_s16
-+
-+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
-+#define vmov_n_u32 vdup_n_u32
-+
-+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
-+#define vmov_n_s8 vdup_n_s8
-+
-+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
-+#define vmov_n_s16 vdup_n_s16
-+
-+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
-+#define vmov_n_s32 vdup_n_s32
-+
-+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
-+#define vmov_n_p8 vdup_n_u8
-+
-+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
-+#define vmov_n_p16 vdup_n_s16
-+
-+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
-+#define vmov_n_f32 vdup_n_f32
-+
-+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_u8 vdupq_n_u8
-+
-+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_u16 vdupq_n_s16
-+
-+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_u32 vdupq_n_u32
-+
-+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_s8 vdupq_n_s8
-+
-+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_s16 vdupq_n_s16
-+
-+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_s32 vdupq_n_s32
-+
-+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_p8 vdupq_n_u8
-+
-+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_p16 vdupq_n_s16
-+
-+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_f32 vdupq_n_f32
-+
-+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
-+#define vmov_n_s64 vdup_n_s64
-+
-+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
-+#define vmov_n_u64 vdup_n_u64
-+
-+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-+#define vmovq_n_s64 vdupq_n_s64
-+
-+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+#define vmovq_n_u64 vdupq_n_u64
-+
-+//**************Set all lanes to the value of one lane of a vector *************
-+//****************************************************************************
-+//Here a shuffle is a better solution than lane extraction followed by a set1 function.
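-+//An illustrative usage sketch (values are arbitrary):
-+//  int32x2_t v = vcreate_s32(0x0000000200000001ULL); // lanes: 1, 2
-+//  int32x2_t d = vdup_lane_s32(v, 1);                // lanes: 2, 2
-+//  int32x4_t q = vdupq_lane_s32(v, 0);               // lanes: 1, 1, 1, 1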
-+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8x8_t res;
-+ uint8_t valane;
-+ int i = 0;
-+ valane = vec.m64_u8[lane];
-+ for (i = 0; i<8; i++) {
-+ res.m64_u8[i] = valane;
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16x4_t res;
-+ uint16_t valane;
-+ valane = vec.m64_u16[lane];
-+ res.m64_u16[0] = valane;
-+ res.m64_u16[1] = valane;
-+ res.m64_u16[2] = valane;
-+ res.m64_u16[3] = valane;
-+ return res;
-+}
-+
-+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = vec.m64_u32[lane];
-+ res.m64_u32[1] = res.m64_u32[0];
-+ return res;
-+}
-+
-+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+#define vdup_lane_s8 vdup_lane_u8
-+
-+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+#define vdup_lane_s16 vdup_lane_u16
-+
-+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+#define vdup_lane_s32 vdup_lane_u32
-+
-+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+#define vdup_lane_p8 vdup_lane_u8
-+
-+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+#define vdup_lane_p16 vdup_lane_s16
-+
-+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = vec.m64_f32[lane];
-+ res.m64_f32[1] = res.m64_f32[0];
-+ return res;
-+}
-+
-+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
-+{
-+ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
-+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
-+}
-+
-+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
-+{
-+ //we could use 8bit shuffle for 16 bit as well
-+ const int8_t lane16 = ((int8_t) lane) << 1;
-+ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
-+ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
-+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
-+}
-+
-+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6))
-+
-+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+#define vdupq_lane_s8 vdupq_lane_u8
-+
-+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+#define vdupq_lane_s16 vdupq_lane_u16
-+
-+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_s32 vdupq_lane_u32
-+
-+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+#define vdupq_lane_p8 vdupq_lane_u8
-+
-+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+#define vdupq_lane_p16 vdupq_lane_s16
-+
-+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
-+
-+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+#define vdup_lane_s64(vec,lane) vec
-+
-+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+#define vdup_lane_u64(vec,lane) vec
-+
-+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
-+{
-+ __m128i vec128;
-+ vec128 = _pM128i(vec);
-+ return _mm_unpacklo_epi64(vec128,vec128);
-+}
-+
-+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+#define vdupq_lane_u64 vdupq_lane_s64
-+
-+// ********************************************************************
-+// ******************** Combining vectors *****************************
-+// ********************************************************************
-+//These intrinsics join two 64-bit vectors into a single 128-bit vector.
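-+//A minimal usage sketch (illustrative only; values are arbitrary):
-+//  uint32x2_t lo = vdup_n_u32(1);
-+//  uint32x2_t hi = vdup_n_u32(2);
-+//  uint32x4_t q = vcombine_u32(lo, hi); // lanes: 1, 1, 2, 2 (low half first)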
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
-+#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
-+#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
-+#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
-+#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
-+//current IA SIMD doesn't support float16
-+
-+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
-+{
-+ __m128i res;
-+ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
-+ return _M128(res);
-+}
-+
-+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
-+#define vcombine_u8 vcombine_s8
-+
-+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
-+#define vcombine_u16 vcombine_s16
-+
-+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
-+#define vcombine_u32 vcombine_s32
-+
-+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
-+#define vcombine_u64 vcombine_s64
-+
-+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
-+#define vcombine_p8 vcombine_u8
-+
-+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
-+#define vcombine_p16 vcombine_u16
-+
-+//**********************************************************************
-+//************************* Splitting vectors **************************
-+//**********************************************************************
-+//**************** Get high part ******************************************
-+//These intrinsics split a 128-bit vector into two component 64-bit vectors
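/* Minimal sketch (hypothetical helper): split a vector and add the two halves
   lane-wise; vadd_s16 is assumed to be provided elsewhere in this header. */
static int16x4_t fold_halves_example(int16x8_t v)
{
    return vadd_s16(vget_low_s16(v), vget_high_s16(v));
}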
-+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
-+{
-+ int64x1_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
-+{
-+ __m128i res;
-+ __m64_128 res64;
-+ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
-+ return64(res);
-+}
-+
-+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
-+#define vget_high_u8 vget_high_s8
-+
-+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
-+#define vget_high_u16 vget_high_s16
-+
-+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
-+#define vget_high_u32 vget_high_s32
-+
-+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
-+#define vget_high_u64 vget_high_s64
-+
-+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
-+#define vget_high_p8 vget_high_u8
-+
-+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
-+#define vget_high_p16 vget_high_u16
-+
-+//********************** Get low part **********************
-+//**********************************************************
-+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
-+{
-+ int16x4_t res64;
-+ return64(a);
-+}
-+
-+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
-+{
-+ int16x4_t res64;
-+ return64(a);
-+}
-+
-+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
-+{
-+ int32x2_t res64;
-+ return64(a);
-+}
-+
-+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
-+{
-+ int64x1_t res64;
-+ return64 (a);
-+}
-+
-+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
-+{
-+ float32x2_t res64;
-+ _M64f(res64, a);
-+ return res64;
-+}
-+
-+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
-+#define vget_low_u8 vget_low_s8
-+
-+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
-+#define vget_low_u16 vget_low_s16
-+
-+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
-+#define vget_low_u32 vget_low_s32
-+
-+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
-+#define vget_low_u64 vget_low_s64
-+
-+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
-+#define vget_low_p8 vget_low_u8
-+
-+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
-+#define vget_low_p16 vget_low_s16
-+
-+//**************************************************************************
-+//************************ Converting vectors **********************************
-+//**************************************************************************
-+//************* Convert from float ***************************************
-+// need to set _MM_SET_ROUNDING_MODE(x) accordingly
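/* Worked example of the truncation used below (a sketch with illustrative values):
   _mm_cvttps_epi32 rounds toward zero, so 1.7f -> 1, -1.7f -> -1 and 2.5f -> 2. */
static int32x2_t cvt_trunc_example(void)
{
    float32x2_t v = vdup_n_f32(1.7f);
    return vcvt_s32_f32(v); /* both lanes become 1 */
}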
-+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
-+_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
-+ return64(res);
-+}
-+
-+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
-+{
-+ //may not be as efficient as a serial SIMD solution
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = vcvtq_u32_f32(_pM128(a));
-+ return64(res);
-+}
-+
-+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-+#define vcvtq_s32_f32 _mm_cvttps_epi32
-+
-+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
-+{
-+ //No single-instruction SSE solution, but we could implement it as follows:
-+ __m128i resi;
-+ __m128 zero, mask, a_pos, mask_f_max_si, res;
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ zero = _mm_setzero_ps();
-+ mask = _mm_cmpgt_ps(a, zero);
-+ a_pos = _mm_and_ps(a, mask);
-+ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
-+ res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits into the signed range we don't subtract anything
-+ resi = _mm_cvttps_epi32(res);
-+ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
-+}
-+
-+// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
-+//*************************************************************************************************
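/* Worked example of the fixed-point scaling (illustrative values): with b = 8 the
   input is multiplied by 2^8 = 256 before truncation, so vcvt_n_s32_f32 turns
   1.5f into 384 and -0.25f into -64 (a Q.8 representation). */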
-+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
-+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(vcvtq_n_s32_f32(_pM128(a),b));
-+}
-+
-+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
-+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res;
-+ float convconst;
-+ convconst = (float)((uint32_t)1 << b);
-+ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
-+ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
-+ return res;
-+}
-+
-+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ __m128 cconst128;
-+ __m128i mask, res;
-+ convconst = (float)(1 << b);
-+ cconst128 = vdupq_n_f32(convconst);
-+ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
-+ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
-+}
-+
-+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
-+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128;
-+ convconst = (float)(1 << b);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
-+}
-+
-+//***************** Convert to float *************************
-+//*************************************************************
-+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
-+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (float) a.m64_i32[0];
-+ res.m64_f32[1] = (float) a.m64_i32[1];
-+ return res;
-+}
-+
-+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
-+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (float) a.m64_u32[0];
-+ res.m64_f32[1] = (float) a.m64_u32[1];
-+ return res;
-+}
-+
-+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
-+
-+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
-+{
-+ //solution may not be optimal
-+ __m128 two16, fHi, fLo;
-+ __m128i hi, lo;
-+ two16 = _mm_set1_ps((float)0x10000); //2^16
-+ // Avoid double rounding by doing two exact conversions
-+ // of high and low 16-bit segments
-+ hi = _mm_srli_epi32(a, 16);
-+ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
-+ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
-+ fLo = _mm_cvtepi32_ps(lo);
-+ // do single rounding according to current rounding mode
-+ return _mm_add_ps(fHi, fLo);
-+}
-+
-+// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
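/* Worked example for the reverse direction (illustrative values): with b = 8 the
   integer is scaled by 1/2^8, so vcvt_n_f32_s32 turns 384 into 1.5f and -64 into -0.25f. */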
-+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
-+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ float32x2_t res;
-+ float convconst;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ res.m64_f32[0] = a.m64_i32[0] * convconst;
-+ res.m64_f32[1] = a.m64_i32[1] * convconst;
-+ return res;
-+}
-+
-+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
-+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
-+{
-+ float32x2_t res;
-+ float convconst;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ res.m64_f32[0] = a.m64_u32[0] * convconst;
-+ res.m64_f32[1] = a.m64_u32[1] * convconst;
-+ return res;
-+}
-+
-+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128, af;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ af = _mm_cvtepi32_ps(a);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return _mm_mul_ps(af,cconst128);
-+}
-+
-+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
-+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128, af;
-+ convconst = (float)(1. / (1 << b));
-+ af = vcvtq_f32_u32(a);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return _mm_mul_ps(af,cconst128);
-+}
-+
-+//**************Convert between floats ***********************
-+//************************************************************
-+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
-+//Intel SIMD doesn't support 16-bit floats currently
-+
-+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
-+//Intel SIMD doesn't support 16-bit floats currently; the only solution is to store 16-bit floats and load them as 32 bits
-+
-+//************Vector narrow integer conversion (truncation) ******************
-+//****************************************************************************
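/* Minimal sketch (hypothetical helper and values): plain truncation keeps the low
   half of every element, so an int16 lane of 0x1234 narrows to 0x34. */
static int8x8_t narrow_example(int16x8_t v)
{
    return vmovn_s16(v); /* no saturation */
}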
-+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
-+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
-+ return64(res);
-+}
-+
-+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
-+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
-+ return64(res);
-+}
-+
-+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
-+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
-+{
-+ //may not be as efficient as a serial implementation
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
-+ return64(res);
-+}
-+
-+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
-+#define vmovn_u16 vmovn_s16
-+
-+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
-+#define vmovn_u32 vmovn_s32
-+
-+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
-+#define vmovn_u64 vmovn_s64
-+
-+//**************** Vector long move ***********************
-+//***********************************************************
-+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
-+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
-+
-+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
-+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
-+
-+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
-+#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
-+
-+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
-+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
-+
-+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
-+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
-+
-+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
-+#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
-+
-+//*************Vector saturating narrow integer*****************
-+//**************************************************************
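/* Minimal sketch (hypothetical helper): unlike vmovn, the saturating variants clamp,
   so an int16 lane of 300 narrows to 127 and -300 narrows to -128. */
static int8x8_t narrow_sat_example(int16x8_t v)
{
    return vqmovn_s16(v);
}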
-+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
-+_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_packs_epi16(a, a);
-+ return64(res);
-+}
-+
-+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
-+_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_packs_epi32(a, a);
-+ return64(res);
-+}
-+
-+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
-+{
-+ int32x2_t res;
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
-+ if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
-+ if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
-+ if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)atmp[0];
-+ res.m64_i32[1] = (int32_t)atmp[1];
-+ return res;
-+}
-+
-+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
-+_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
-+{
-+ //no uint16 to uint8 conversion in SSE, need to truncate to max signed first
-+ uint8x8_t res64;
-+ __m128i c7fff, a_trunc;
-+ c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
-+ a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
-+ a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
-+ return64(a_trunc);
-+}
-+
-+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
-+_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
-+{
-+ //no uint32 to uint16 conversion in SSE, need to truncate to max signed first
-+ uint16x4_t res64;
-+ __m128i c7fffffff, a_trunc;
-+ c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
-+ a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
-+ a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
-+ return64(a_trunc);
-+}
-+
-+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
-+_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i res_hi, mask;
-+ mask = _mm_setzero_si128();
-+ res_hi = _mm_srli_epi64(a, 32);
-+ res_hi = _mm_cmpeq_epi32(res_hi, mask);
-+ mask = _mm_cmpeq_epi32(mask,mask); //all fff
-+ mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the numbers that exceed 32 bits
-+ res_hi = _mm_or_si128(a, mask);
-+ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(res_hi);
-+}
-+//************* Vector saturating narrow integer signed->unsigned **************
-+//*****************************************************************************
-+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
-+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
-+{
-+ uint8x8_t res64;
-+ __m128i res;
-+ res = _mm_packus_epi16(a, a); //use low 64bits only
-+ return64(res);
-+}
-+
-+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
-+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
-+{
-+ uint16x4_t res64;
-+ __m128i res;
-+ res = _MM_PACKUS1_EPI32(a); //use low 64bits only
-+ return64(res);
-+}
-+
-+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
-+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
-+{
-+ uint32x2_t res64;
-+ __m128i res_hi,res_lo, zero, cmp;
-+ zero = _mm_setzero_si128();
-+ res_hi = _mm_srli_epi64(a, 32);
-+ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
-+ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
-+ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
-+ res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
-+ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(res_lo);
-+}
-+
-+// ********************************************************
-+// **************** Table look up **************************
-+// ********************************************************
-+//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
-+//in a table and generate a new vector. Indexes out of range return 0.
-+//for Intel SIMD we need to set the MSB to 1 for zero return
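/* Minimal usage sketch (hypothetical helper and index values): with indexes
   {7,6,5,4,3,2,1,0} the lookup reverses the byte order of the table; any index
   greater than 7 would yield 0 in that lane. vld1_u8 is assumed to be available
   elsewhere in this header. */
static uint8x8_t reverse_bytes_example(uint8x8_t table)
{
    const uint8_t idx[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    return vtbl1_u8(table, vld1_u8(idx));
}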
-+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ __m128i c7, maskgt, bmask, b128;
-+ c7 = _mm_set1_epi8 (7);
-+ b128 = _pM128i(b);
-+ maskgt = _mm_cmpgt_epi8(b128,c7);
-+ bmask = _mm_or_si128(b128,maskgt);
-+ bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
-+ return64(bmask);
-+}
-+
-+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
-+#define vtbl1_s8 vtbl1_u8
-+
-+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+#define vtbl1_p8 vtbl1_u8
-+
-+//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
-+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ __m128i c15, a01, maskgt15, bmask, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ b128 = _pM128i(b);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ bmask = _mm_or_si128(b128, maskgt15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
-+ a01 = _mm_shuffle_epi8(a01, bmask);
-+ return64(a01);
-+}
-+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
-+
-+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+#define vtbl2_s8 vtbl2_u8
-+
-+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+#define vtbl2_p8 vtbl2_u8
-+
-+//Special trick to avoid the "__declspec(align('16')) won't be aligned" error
-+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
-+{
-+ //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ c23 = _mm_set1_epi8 (23);
-+ b128 = _pM128i(b);
-+ maskgt23 = _mm_cmpgt_epi8(b128,c23);
-+ bmask = _mm_or_si128(b128, maskgt23);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
-+ sh0 = _mm_shuffle_epi8(a01, bmask);
-+ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
-+ return64(sh0);
-+}
-+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
-+
-+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+#define vtbl3_s8 vtbl3_u8
-+
-+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+#define vtbl3_p8 vtbl3_u8
-+
-+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
-+{
-+ //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ c31 = _mm_set1_epi8 (31);
-+ b128 = _pM128i(b);
-+ maskgt31 = _mm_cmpgt_epi8(b128,c31);
-+ bmask = _mm_or_si128(b128, maskgt31);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
-+ a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
-+ sh0 = _mm_shuffle_epi8(a01, bmask);
-+ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
-+ return64(sh0);
-+}
-+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
-+
-+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+#define vtbl4_s8 vtbl4_u8
-+
-+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+#define vtbl4_p8 vtbl4_u8
-+
-+//****************** Extended table look up intrinsics ***************************
-+//**********************************************************************************
-+//VTBX (Vector Table Extension) works in the same way as VTBL does,
-+// except that indexes out of range leave the destination element unchanged.
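/* Minimal sketch (hypothetical helper): lanes whose index is out of range keep the
   corresponding byte of 'fallback' instead of becoming 0 as with vtbl1_u8. */
static uint8x8_t lookup_with_default_example(uint8x8_t fallback, uint8x8_t table, uint8x8_t idx)
{
    return vtbx1_u8(fallback, table, idx);
}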
-+
-+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ uint8x8_t res64;
-+ __m128i c7, maskgt, sh, c128;
-+ c7 = _mm_set1_epi8 (7);
-+ c128 = _pM128i(c);
-+ maskgt = _mm_cmpgt_epi8(c128,c7);
-+ c7 = _mm_and_si128(maskgt,_pM128i(a));
-+ sh = _mm_shuffle_epi8(_pM128i(b),c128);
-+ sh = _mm_andnot_si128(maskgt,sh);
-+ sh = _mm_or_si128(sh,c7);
-+ return64(sh);
-+}
-+
-+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
-+#define vtbx1_s8 vtbx1_u8
-+
-+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+#define vtbx1_p8 vtbx1_u8
-+
-+//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
-+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
-+{
-+ uint8x8_t res64;
-+ __m128i c15, b01, maskgt15, sh, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128, c15);
-+ c15 = _mm_and_si128(maskgt15, _pM128i(a));
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
-+ sh = _mm_shuffle_epi8(b01, c128);
-+ sh = _mm_andnot_si128(maskgt15, sh);
-+ sh = _mm_or_si128(sh,c15);
-+ return64(sh);
-+}
-+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+#define vtbx2_s8 vtbx2_u8
-+
-+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+#define vtbx2_p8 vtbx2_u8
-+
-+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
-+{
-+ //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c23 = _mm_set1_epi8 (23);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
-+ maskgt23 = _mm_cmpgt_epi8(c128,c23);
-+ c23 = _mm_and_si128(maskgt23, _pM128i(a));
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
-+ sh0 = _mm_shuffle_epi8(b01, c128);
-+ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
-+ sh0 = _mm_andnot_si128(maskgt23,sh0);
-+ sh0 = _mm_or_si128(sh0,c23);
-+ return64(sh0);
-+}
-+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
-+#define vtbx3_s8 vtbx3_u8
-+
-+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
-+#define vtbx3_p8 vtbx3_u8
-+
-+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
-+{
-+ //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c31 = _mm_set1_epi8 (31);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
-+ maskgt31 = _mm_cmpgt_epi8(c128,c31);
-+ c31 = _mm_and_si128(maskgt31, _pM128i(a));
-+
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
-+ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
-+ sh0 = _mm_shuffle_epi8(b01, c128);
-+ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
-+ sh0 = _mm_andnot_si128(maskgt31,sh0);
-+ sh0 = _mm_or_si128(sh0,c31);
-+ return64(sh0);
-+}
-+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
-+#define vtbx4_s8 vtbx4_u8
-+
-+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
-+#define vtbx4_p8 vtbx4_u8
-+
-+//*************************************************************************************************
-+// *************************** Operations with a scalar value *********************************
-+//*************************************************************************************************
-+
-+//******* Vector multiply accumulate by scalar *************************************************
-+//**********************************************************************************************
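/* What the lane forms below compute, written as a per-lane sketch (illustrative):
   result[i] = a[i] + b[i] * v[l]; the selected lane is broadcast with vdup and the
   ordinary vmla_* is then applied, as the implementations show. */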
-+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
-+{
-+ int16_t c;
-+ int16x4_t scalar;
-+ c = vget_lane_s16(v, l);
-+ scalar = vdup_n_s16(c);
-+ return vmla_s16(a, b, scalar);
-+}
-+
-+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
-+{
-+ int32_t c;
-+ int32x2_t scalar;
-+ c = vget_lane_s32(v, l);
-+ scalar = vdup_n_s32(c);
-+ return vmla_s32(a, b, scalar);
-+}
-+
-+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
-+#define vmla_lane_u16 vmla_lane_s16
-+
-+
-+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
-+#define vmla_lane_u32 vmla_lane_s32
-+
-+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
-+{
-+ float32_t vlane;
-+ float32x2_t c;
-+ vlane = vget_lane_f32(v, l);
-+ c = vdup_n_f32(vlane);
-+ return vmla_f32(a,b,c);
-+}
-+
-+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x8_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdupq_n_s16(vlane);
-+ return vmlaq_s16(a,b,c);
-+}
-+
-+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x4_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdupq_n_s32(vlane);
-+ return vmlaq_s32(a,b,c);
-+}
-+
-+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+#define vmlaq_lane_u16 vmlaq_lane_s16
-+
-+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+#define vmlaq_lane_u32 vmlaq_lane_s32
-+
-+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32_t vlane;
-+ float32x4_t c;
-+ vlane = vget_lane_f32(v, l);
-+ c = vdupq_n_f32(vlane);
-+ return vmlaq_f32(a,b,c);
-+}
-+
-+//***************** Vector widening multiply accumulate by scalar **********************
-+//***************************************************************************************
-+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlal_s16(a, b, c);
-+}
-+
-+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmlal_s32(a, b, c);
-+}
-+
-+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_u16(v, l);
-+ c = vdup_n_u16(vlane);
-+ return vmlal_u16(a, b, c);
-+}
-+
-+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmlal_u32(a, b, c);
-+}
-+
-+// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
-+// ************************************************************************************************
-+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vqdmlal_s16(a, b, c);
-+}
-+
-+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
-+{
-+ int32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vqdmlal_s32(a, b, c);
-+}
-+
-+// ****** Vector multiply subtract by scalar *****************
-+// *************************************************************
-+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmls_s16(a, b, c);
-+}
-+
-+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmls_s32(a, b, c);
-+}
-+
-+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmls_s16(a, b, c);
-+}
-+
-+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmls_u32(a, b, c);
-+}
-+
-+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
-+{
-+ float32_t vlane;
-+ float32x2_t c;
-+ vlane = (float) vget_lane_f32(v, l);
-+ c = vdup_n_f32(vlane);
-+ return vmls_f32(a,b,c);
-+}
-+
-+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x8_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdupq_n_s16(vlane);
-+ return vmlsq_s16(a, b,c);
-+}
-+
-+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x4_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdupq_n_s32(vlane);
-+ return vmlsq_s32(a,b,c);
-+}
-+
-+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x8_t c;
-+ vlane = vget_lane_u16(v, l);
-+ c = vdupq_n_u16(vlane);
-+ return vmlsq_u16(a,b,c);
-+}
-+
-+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x4_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdupq_n_u32(vlane);
-+ return vmlsq_u32(a,b,c);
-+}
-+
-+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32_t vlane;
-+ float32x4_t c;
-+ vlane = (float) vget_lane_f32(v, l);
-+ c = vdupq_n_f32(vlane);
-+ return vmlsq_f32(a,b,c);
-+}
-+
-+// **** Vector widening multiply subtract by scalar ****
-+// ****************************************************
-+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlsl_s16(a, b, c);
-+}
-+
-+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmlsl_s32(a, b, c);
-+}
-+
-+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlsl_s16(a, b, c);
-+}
-+
-+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmlsl_u32(a, b, c);
-+}
-+
-+//********* Vector widening saturating doubling multiply subtract by scalar **************************
-+//******************************************************************************************************
-+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vqdmlsl_s16(a, b, c);
-+}
-+
-+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vqdmlsl_s32(a, b, c);
-+}
-+//********** Vector multiply with scalar *****************************
-+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
-+{
-+ int16x4_t b16x4;
-+ b16x4 = vdup_n_s16(b);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
-+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
-+{
-+ //serial solution looks faster
-+ int32x2_t b32x2;
-+ b32x2 = vdup_n_s32(b);
-+ return vmul_s32(a, b32x2);
-+}
-+
-+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
-+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
-+{
-+ float32x2_t b32x2;
-+ b32x2 = vdup_n_f32(b);
-+ return vmul_f32(a, b32x2);
-+}
-+
-+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
-+{
-+ uint16x4_t b16x4;
-+ b16x4 = vdup_n_s16(b);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
-+{
-+ //serial solution looks faster
-+ uint32x2_t b32x2;
-+ b32x2 = vdup_n_u32(b);
-+ return vmul_u32(a, b32x2);
-+}
-+
-+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
-+{
-+ int16x8_t b16x8;
-+ b16x8 = vdupq_n_s16(b);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
-+{
-+ int32x4_t b32x4;
-+ b32x4 = vdupq_n_s32(b);
-+ return vmulq_s32(a, b32x4);
-+}
-+
-+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
-+{
-+ float32x4_t b32x4;
-+ b32x4 = vdupq_n_f32(b);
-+ return vmulq_f32(a, b32x4);
-+}
-+
-+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
-+{
-+ uint16x8_t b16x8;
-+ b16x8 = vdupq_n_s16(b);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
-+{
-+ uint32x4_t b32x4;
-+ b32x4 = vdupq_n_u32(b);
-+ return vmulq_u32(a, b32x4);
-+}
-+
-+//********** Vector multiply lane *****************************
-+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
-+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
-+{
-+ int16x4_t b16x4;
-+ int16_t vlane;
-+ vlane = vget_lane_s16(b, c);
-+ b16x4 = vdup_n_s16(vlane);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
-+{
-+ int32x2_t b32x2;
-+ int32_t vlane;
-+ vlane = vget_lane_s32(b, c);
-+ b32x2 = vdup_n_s32(vlane);
-+ return vmul_s32(a, b32x2);
-+}
-+
-+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
-+{
-+ float32x2_t b32x2;
-+ float32_t vlane;
-+ vlane = vget_lane_f32(b, c);
-+ b32x2 = vdup_n_f32(vlane);
-+ return vmul_f32(a, b32x2);
-+}
-+
-+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
-+#define vmul_lane_u16 vmul_lane_s16
-+
-+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
-+#define vmul_lane_u32 vmul_lane_s32
-+
-+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
-+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
-+{
-+ int16x8_t b16x8;
-+ int16_t vlane;
-+ vlane = vget_lane_s16(b, c);
-+ b16x8 = vdupq_n_s16(vlane);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
-+{
-+ int32x4_t b32x4;
-+ int32_t vlane;
-+ vlane = vget_lane_s32(b, c);
-+ b32x4 = vdupq_n_s32(vlane);
-+ return vmulq_s32(a, b32x4);
-+}
-+
-+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
-+{
-+ float32x4_t b32x4;
-+ float32_t vlane;
-+ vlane = vget_lane_f32(b, c);
-+ b32x4 = vdupq_n_f32(vlane);
-+ return vmulq_f32(a, b32x4);
-+}
-+
-+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
-+#define vmulq_lane_u16 vmulq_lane_s16
-+
-+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
-+#define vmulq_lane_u32 vmulq_lane_s32
-+
-+//**** Vector long multiply with scalar ************
-+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
-+{
-+ int16x4_t b16x4;
-+ b16x4 = vdup_n_s16(val2);
-+ return vmull_s16(vec1, b16x4);
-+}
-+
-+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
-+{
-+ int32x2_t b32x2;
-+ b32x2 = vdup_n_s32(val2);
-+ return vmull_s32(vec1, b32x2);
-+}
-+
-+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
-+{
-+ uint16x4_t b16x4;
-+ b16x4 = vdup_n_s16(val2);
-+ return vmull_s16(vec1, b16x4);
-+}
-+
-+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
-+{
-+ uint32x2_t b32x2;
-+ b32x2 = vdup_n_u32(val2);
-+ return vmull_u32(vec1, b32x2);
-+}
-+
-+//**** Vector long multiply by scalar ****
-+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t b;
-+ vlane = vget_lane_s16(val2, val3);
-+ b = vdup_n_s16(vlane);
-+ return vmull_s16(vec1, b);
-+}
-+
-+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t b;
-+ vlane = vget_lane_s32(val2, val3);
-+ b = vdup_n_s32(vlane);
-+ return vmull_s32(vec1, b);
-+}
-+
-+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t b;
-+ vlane = vget_lane_s16(val2, val3);
-+ b = vdup_n_s16(vlane);
-+ return vmull_s16(vec1, b);
-+}
-+
-+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t b;
-+ vlane = vget_lane_u32(val2, val3);
-+ b = vdup_n_u32(vlane);
-+ return vmull_u32(vec1, b);
-+}
-+
-+//********* Vector saturating doubling long multiply with scalar *******************
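/* Worked example of the doubling and saturation (illustrative values): the result is
   sat(2 * a[i] * b), so 0x4000 * 0x4000 gives 0x20000000, while (-32768) * (-32768)
   would reach 2^31 and saturates to 0x7FFFFFFF. */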
-+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
-+{
-+ //the serial solution may be faster due to saturation
-+ int16x4_t b;
-+ b = vdup_n_s16(val2);
-+ return vqdmull_s16(vec1, b);
-+}
-+
-+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t b;
-+ b = vdup_n_s32(val2);
-+ return vqdmull_s32(vec1,b); //slow serial function!!!!
-+}
-+
-+//************* Vector saturating doubling long multiply by scalar ***********************************************
-+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
-+{
-+ int16_t c;
-+ int16x4_t scalar;
-+ c = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(c);
-+ return vqdmull_s16(vec1, scalar);
-+}
-+
-+
-+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32_t c;
-+ int32x2_t scalar;
-+ c = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(c);
-+ return vqdmull_s32(vec1,scalar); //slow serial function!!!!
-+}
-+
-+// *****Vector saturating doubling multiply high with scalar *****
-+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
-+{
-+ int16x4_t res64;
-+ return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
-+}
-+
-+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
-+{
-+ int32x2_t res64;
-+ return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
-+}
-+
-+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(val2);
-+ return vqdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(val2);
-+ return vqdmulhq_s32(vec1, scalar);
-+}
-+
-+//***** Vector saturating doubling multiply high by scalar ****************
-+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16_t vlane;
-+ int16x4_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(vlane);
-+ return vqdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32_t vlane;
-+ int32x2_t scalar;
-+ vlane = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(vlane);
-+ return vqdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16_t vlane;
-+ int16x8_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdupq_n_s16(vlane );
-+ return vqdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //solution may not be optimal
-+ int32_t vlane;
-+ int32x4_t scalar;
-+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
-+ scalar = vdupq_n_s32(vlane );
-+ return vqdmulhq_s32(vec1, scalar);
-+}
-+
-+//******** Vector saturating rounding doubling multiply high with scalar ***
-+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16x4_t scalar;
-+ scalar = vdup_n_s16(val2);
-+ return vqrdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x2_t scalar;
-+ scalar = vdup_n_s32(val2);
-+ return vqrdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(val2);
-+ return vqrdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(val2);
-+ return vqrdmulhq_s32(vec1, scalar);
-+}
-+
-+//********* Vector rounding saturating doubling multiply high by scalar ****
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16_t vlane;
-+ int16x4_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(vlane);
-+ return vqrdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32_t vlane;
-+ int32x2_t scalar;
-+ vlane = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(vlane);
-+ return vqrdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
-+{
-+ //solution may not be optimal
-+ int16_t vlane;
-+ int16x8_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdupq_n_s16(vlane);
-+ return vqrdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //solution may not be optimal
-+ int32_t vlane;
-+ int32x4_t scalar;
-+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
-+ scalar = vdupq_n_s32(vlane );
-+ return vqrdmulhq_s32(vec1, scalar);
-+}
-+
-+//**************Vector multiply accumulate with scalar *******************
-+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
-+{
-+ int16x4_t scalar;
-+ scalar = vdup_n_s16(c);
-+ return vmla_s16(a, b, scalar);
-+}
-+
-+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
-+{
-+ int32x2_t scalar;
-+ scalar = vdup_n_s32(c);
-+ return vmla_s32(a, b, scalar);
-+}
-+
-+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
-+#define vmla_n_u16 vmla_n_s16
-+
-+
-+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
-+#define vmla_n_u32 vmla_n_s32
-+
-+
-+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
-+{
-+ float32x2_t scalar;
-+ scalar = vdup_n_f32(c);
-+ return vmla_f32(a, b, scalar);
-+}
-+
-+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
-+{
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(c);
-+ return vmlaq_s16(a,b,scalar);
-+}
-+
-+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(c);
-+ return vmlaq_s32(a,b,scalar);
-+}
-+
-+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-+#define vmlaq_n_u16 vmlaq_n_s16
-+
-+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-+#define vmlaq_n_u32 vmlaq_n_s32
-+
-+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32x4_t scalar;
-+ scalar = vdupq_n_f32(c);
-+ return vmlaq_f32(a,b,scalar);
-+}
-+
-+//************Vector widening multiply accumulate with scalar****************************
-+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmlal_s16(a, b, vc);
-+}
-+
-+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmlal_s32(a, b, vc);
-+}
-+
-+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
-+{
-+ uint16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmlal_s16(a, b, vc);
-+}
-+
-+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmlal_u32(a, b, vc);
-+}
-+
-+//************ Vector widening saturating doubling multiply accumulate with scalar **************
-+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
-+{
-+ //not optimal SIMD solution, serial may be faster
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vqdmlal_s16(a, b, vc);
-+}
-+
-+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vqdmlal_s32(a, b, vc);
-+}
-+
-+//******** Vector multiply subtract with scalar **************
-+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmls_s16(a, b, vc);
-+}
-+
-+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmls_s32(a, b, vc);
-+}
-+
-+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
-+{
-+ uint16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmls_s16(a, b, vc);
-+}
-+
-+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmls_u32(a, b, vc);
-+}
-+
-+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
-+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
-+ return res;
-+}
-+
-+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
-+{
-+ int16x8_t vc;
-+ vc = vdupq_n_s16(c);
-+ return vmlsq_s16(a, b,vc);
-+}
-+
-+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
-+{
-+ int32x4_t vc;
-+ vc = vdupq_n_s32(c);
-+ return vmlsq_s32(a,b,vc);
-+}
-+
-+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
-+{
-+ uint32x4_t vc;
-+ vc = vdupq_n_u32(c);
-+ return vmlsq_u32(a,b,vc);
-+}
-+
-+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
-+{
-+ uint32x4_t vc;
-+ vc = vdupq_n_u32(c);
-+ return vmlsq_u32(a,b,vc);
-+}
-+
-+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
-+{
-+ float32x4_t vc;
-+ vc = vdupq_n_f32(c);
-+ return vmlsq_f32(a,b,vc);
-+}
-+
-+//**** Vector widening multiply subtract with scalar ******
-+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmlsl_s16(a, b, vc);
-+}
-+
-+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmlsl_s32(a, b, vc);
-+}
-+
-+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
-+{
-+ uint16x4_t vc;
-+ vc = vdup_n_u16(c);
-+ return vmlsl_u16(a, b, vc);
-+}
-+
-+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmlsl_u32(a, b, vc);
-+}
-+
-+//***** Vector widening saturating doubling multiply subtract with scalar *********
-+//**********************************************************************************
-+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vqdmlsl_s16(a, b, vc);
-+}
-+
-+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vqdmlsl_s32(a, b, vc);
-+}
-+
-+//******************* Vector extract ***********************************************
-+//*************************************************************************************
-+//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
-+//vector and the top end of the first, concatenates them, and places the result in the destination vector
-+//c elements from the bottom end of the second operand and (8-c) from the top end of the first
-+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int8x8_t res;
-+ int i;
-+ for (i = 0; i<8 - c; i++) {
-+ res.m64_i8[i] = a.m64_i8[i + c];
-+ }
-+ for(i = 0; i<c; i++) {
-+ res.m64_i8[8 - c + i] = b.m64_i8[i];
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+#define vext_u8 vext_s8
-+//same result tested
-+
-+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+#define vext_p8 vext_u8
-+
-+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t _NEON2SSE_PERFORMANCE_WARNING (vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int i;
-+ for (i = 0; i<4 - c; i++) {
-+ res.m64_i16[i] = a.m64_i16[i + c];
-+ }
-+ for(i = 0; i<c; i++) {
-+ res.m64_i16[4 - c + i] = b.m64_i16[i];
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+#define vext_u16 vext_s16
-+
-+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+#define vext_p16 vext_s16
-+
-+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t res;
-+ if (c==0) {
-+ res.m64_i32[0] = a.m64_i32[0];
-+ res.m64_i32[1] = a.m64_i32[1];
-+ } else {
-+ res.m64_i32[0] = a.m64_i32[1];
-+ res.m64_i32[1] = b.m64_i32[0];
-+ }
-+ return res;
-+}
-+
-+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ float32x2_t res;
-+ if (c==0) {
-+ res.m64_f32[0] = a.m64_f32[0];
-+ res.m64_f32[1] = a.m64_f32[1];
-+ } else {
-+ res.m64_f32[0] = a.m64_f32[1];
-+ res.m64_f32[1] = b.m64_f32[0];
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+#define vext_u32 vext_s32
-+
-+
-+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+#define vext_s64(a,b,c) a
-+
-+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+#define vext_u64(a,b,c) a
-+
-+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-+
-+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-+
-+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_p8 vextq_s8
-+
-+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-+
-+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-+
-+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_p16 vextq_s16
-+
-+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-+
-+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-+
-+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
-+
-+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-+
-+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-+
-+//************ Reverse vector elements (swap endianness)*****************
-+//*************************************************************************
-+//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev64q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
-+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vrev64q_s16(_pM128i(vec));
-+ return64(res);
-+}
-+
-+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
-+{
-+ int32x2_t res;
-+ res.m64_i32[0] = vec.m64_i32[1];
-+ res.m64_i32[1] = vec.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
-+#define vrev64_u8 vrev64_s8
-+
-+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
-+#define vrev64_u16 vrev64_s16
-+
-+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
-+#define vrev64_u32 vrev64_s32
-+
-+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
-+#define vrev64_p8 vrev64_u8
-+
-+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
-+#define vrev64_p16 vrev64_u16
-+
-+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = vec.m64_f32[1];
-+ res.m64_f32[1] = vec.m64_f32[0];
-+ return res;
-+}
-+
-+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
-+{
-+ //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
-+}
-+
-+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
-+{
-+ return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
-+}
-+
-+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-+#define vrev64q_u8 vrev64q_s8
-+
-+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-+#define vrev64q_u16 vrev64q_s16
-+
-+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-+#define vrev64q_u32 vrev64q_s32
-+
-+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-+#define vrev64q_p8 vrev64q_u8
-+
-+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-+#define vrev64q_p16 vrev64q_u16
-+
-+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-+#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
-+
-+//******************** 32 bit shuffles **********************
-+//************************************************************
-+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev32q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
-+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vrev32q_s16(_pM128i(vec));
-+ return64(res);
-+}
-+
-+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
-+#define vrev32_u8 vrev32_s8
-+
-+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
-+#define vrev32_u16 vrev32_s16
-+
-+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
-+#define vrev32_p8 vrev32_u8
-+
-+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
-+#define vrev32_p16 vrev32_u16
-+
-+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-+#define vrev32q_u8 vrev32q_s8
-+
-+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-+#define vrev32q_u16 vrev32q_s16
-+
-+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-+#define vrev32q_p8 vrev32q_u8
-+
-+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
-+#define vrev32q_p16 vrev32q_u16
-+
-+//************* 16 bit shuffles **********************
-+//******************************************************
-+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev16q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
-+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
-+#define vrev16_u8 vrev16_s8
-+
-+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
-+#define vrev16_p8 vrev16_u8
-+
-+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
-+}
-+
-+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-+#define vrev16q_u8 vrev16q_s8
-+
-+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
-+#define vrev16q_p8 vrev16q_u8
-+
-+//*********************************************************************
-+//**************** Other single operand arithmetic *******************
-+//*********************************************************************
-+
-+//*********** Absolute: Vd[i] = |Va[i]| **********************************
-+//************************************************************************
-+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi8(_pM128i(a));
-+ return64(res);
-+}
-+
-+
-+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi32(_pM128i(a));
-+ return64(res);
-+}
-+
-+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-+#define vabsq_s8 _mm_abs_epi8
-+
-+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-+#define vabsq_s16 _mm_abs_epi16
-+
-+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-+#define vabsq_s32 _mm_abs_epi32
-+
-+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ return _mm_and_ps (a, *(__m128*)c7fffffff);
-+}
-+
-+//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
-+//**********************************************************************
-+//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place
-+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vqabsq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vqabsq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vqabsq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
-+{
-+ __m128i c_128, abs, abs_cmp;
-+ c_128 = _mm_set1_epi8 (0x80); //-128
-+ abs = _mm_abs_epi8 (a);
-+ abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
-+{
-+ __m128i c_32768, abs, abs_cmp;
-+ c_32768 = _mm_set1_epi16 (0x8000); //-32768
-+ abs = _mm_abs_epi16 (a);
-+ abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
-+{
-+ __m128i c80000000, abs, abs_cmp;
-+ c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
-+ abs = _mm_abs_epi32 (a);
-+ abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+//*************** Negate: Vd[i] = - Va[i] *************************************
-+//*****************************************************************************
-+//several Negate implementations possible for SIMD.
-+//e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance:
-+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
-+_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vnegq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
-+_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vnegq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
-+_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vnegq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
-+_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNE//d0,d0
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
-+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi8 (zero, a);
-+} //or _mm_sign_epi8 (a, negative numbers vector)
-+
-+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
-+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi16 (zero, a);
-+} //or _mm_sign_epi16 (a, negative numbers vector)
-+
-+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
-+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi32 (zero, a);
-+} //or _mm_sign_epi32 (a, negative numbers vector)
-+
-+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
-+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ return _mm_xor_ps (a, *(__m128*) c80000000);
-+}
-+
-+//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
-+//***************************************************************************************
-+//For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
-+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
-+_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vqnegq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
-+_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vqnegq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
-+_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vqnegq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
-+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_subs_epi8 (zero, a); //saturating subtraction
-+}
-+
-+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
-+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_subs_epi16 (zero, a); //saturating subtraction
-+}
-+
-+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
-+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
-+{
-+ //solution may be not optimal compared with a serial
-+ __m128i c80000000, zero, sub, cmp;
-+ c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
-+ zero = _mm_setzero_si128 ();
-+ sub = _mm_sub_epi32 (zero, a); //subtraction
-+ cmp = _mm_cmpeq_epi32 (a, c80000000);
-+ return _mm_xor_si128 (sub, cmp);
-+}
-+
-+//****************** Count leading zeros ********************************
-+//**************************************************************************
-+//no corresponding vector intrinsics in IA32, need to implement it. While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
-+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vclzq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vclzq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vclzq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+
-+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
-+#define vclz_u8 vclz_s8
-+
-+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
-+#define vclz_u16 vclz_s16
-+
-+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
-+#define vclz_u32 vclz_s32
-+
-+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
-+ /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
-+ /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
-+ /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
-+ __m128i maskLOW, c4, lowclz, mask, hiclz;
-+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
-+ c4 = _mm_set1_epi8(4);
-+ lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
-+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
-+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
-+ hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
-+ mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
-+ lowclz = _mm_and_si128(lowclz,mask);
-+ return _mm_add_epi8(lowclz, hiclz);
-+}
-+
-+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
-+{
-+ __m128i c7, res8x16, res8x16_swap;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
-+ c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
-+ res8x16 = vclzq_s8(a);
-+ res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
-+ res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
-+ res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
-+ c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
-+ res8x16 = _mm_and_si128(res8x16, c7); //lowclz
-+ return _mm_add_epi16(res8x16_swap, res8x16);
-+}
-+
-+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
-+{
-+ __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
-+ c55555555 = _mm_set1_epi32(0x55555555);
-+ c33333333 = _mm_set1_epi32(0x33333333);
-+ c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
-+ c3f = _mm_set1_epi32(0x3f);
-+ c32 = _mm_set1_epi32(32);
-+ tmp = _mm_srli_epi32(a, 1);
-+ res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
-+ tmp = _mm_srli_epi32(res, 2);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
-+ tmp = _mm_srli_epi32(res, 4);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
-+ tmp = _mm_srli_epi32(res, 8);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
-+ tmp = _mm_srli_epi32(res, 16);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
-+
-+ tmp = _mm_srli_epi32(res, 1);
-+ tmp = _mm_and_si128(tmp, c55555555);
-+ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
-+
-+ tmp = _mm_srli_epi32(res, 2);
-+ tmp = _mm_and_si128(tmp, c33333333);
-+ tmp1 = _mm_and_si128(res, c33333333);
-+ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
-+
-+ tmp = _mm_srli_epi32(res, 4);
-+ tmp = _mm_add_epi32(tmp, res);
-+ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
-+
-+ tmp = _mm_srli_epi32(res, 8);
-+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
-+
-+ tmp = _mm_srli_epi32(res, 16);
-+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
-+
-+ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
-+
-+ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
-+}
-+
-+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-+#define vclzq_u8 vclzq_s8
-+
-+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-+#define vclzq_u16 vclzq_s16
-+
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
-+#define vclzq_u32 vclzq_s32
-+
-+//************** Count leading sign bits **************************
-+//********************************************************************
-+//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
-+// the topmost bit, that are the same as the topmost bit, in each element in a vector
-+//No corresponding vector intrinsics in IA32, need to implement it.
-+//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
-+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vclsq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vclsq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vclsq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
-+{
-+ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
-+ cff = _mm_cmpeq_epi8 (a,a); //0xff
-+ c80 = _mm_set1_epi8(0x80);
-+ c1 = _mm_set1_epi8(1);
-+ a_mask = _mm_and_si128(a, c80);
-+ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s8(a_comb);
-+ return _mm_sub_epi8(a_comb, c1);
-+}
-+
-+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
-+{
-+ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
-+ cffff = _mm_cmpeq_epi16(a,a);
-+ c8000 = _mm_slli_epi16(cffff, 15); //0x8000
-+ c1 = _mm_srli_epi16(cffff,15); //0x1
-+ a_mask = _mm_and_si128(a, c8000);
-+ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cffff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s16(a_comb);
-+ return _mm_sub_epi16(a_comb, c1);
-+}
-+
-+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
-+{
-+ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
-+ cffffffff = _mm_cmpeq_epi32(a,a);
-+ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
-+ c1 = _mm_srli_epi32(cffffffff,31); //0x1
-+ a_mask = _mm_and_si128(a, c80000000);
-+ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cffffffff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s32(a_comb);
-+ return _mm_sub_epi32(a_comb, c1);
-+}
-+
-+//************************* Count number of set bits ********************************
-+//*************************************************************************************
-+//No corresponding SIMD solution. One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element
-+//another option is to do the following algorithm:
-+
-+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
-+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
-+{
-+ uint8x8_t res64;
-+ __m128i res;
-+ res = vcntq_u8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
-+#define vcnt_s8 vcnt_u8
-+
-+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
-+#define vcnt_p8 vcnt_u8
-+
-+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
-+ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
-+ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
-+ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 };
-+ __m128i maskLOW, mask, lowpopcnt, hipopcnt;
-+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
-+ mask = _mm_and_si128(a, maskLOW);
-+ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
-+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
-+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
-+ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
-+ return _mm_add_epi8(lowpopcnt, hipopcnt);
-+}
-+
-+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-+#define vcntq_s8 vcntq_u8
-+
-+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
-+#define vcntq_p8 vcntq_u8
-+
-+//**************************************************************************************
-+//*********************** Logical operations ****************************************
-+//**************************************************************************************
-+//************************** Bitwise not ***********************************
-+//several Bitwise not implementations possible for SIMD. Eg "xor" with all ones, but the following one gives good performance
-+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vmvnq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vmvnq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vmvnq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
-+#define vmvn_u8 vmvn_s8
-+
-+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
-+#define vmvn_u16 vmvn_s16
-+
-+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
-+#define vmvn_u32 vmvn_s32
-+
-+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
-+#define vmvn_p8 vmvn_u8
-+
-+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi8 (a,a); //0xff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-+#define vmvnq_u8 vmvnq_s8
-+
-+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-+#define vmvnq_u16 vmvnq_s16
-+
-+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-+#define vmvnq_u32 vmvnq_s32
-+
-+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
-+#define vmvnq_p8 vmvnq_u8
-+
-+//****************** Bitwise and ***********************
-+//******************************************************
-+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
-+#define vand_u8 vand_s8
-+
-+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
-+#define vand_u16 vand_s16
-+
-+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
-+#define vand_u32 vand_s32
-+
-+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
-+#define vand_u64 vand_s64
-+
-+
-+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-+#define vandq_s8 _mm_and_si128
-+
-+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-+#define vandq_s16 _mm_and_si128
-+
-+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-+#define vandq_s32 _mm_and_si128
-+
-+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-+#define vandq_s64 _mm_and_si128
-+
-+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-+#define vandq_u8 _mm_and_si128
-+
-+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-+#define vandq_u16 _mm_and_si128
-+
-+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-+#define vandq_u32 _mm_and_si128
-+
-+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
-+#define vandq_u64 _mm_and_si128
-+
-+//******************** Bitwise or *********************************
-+//******************************************************************
-+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
-+#define vorr_u8 vorr_s8
-+
-+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
-+#define vorr_u16 vorr_s16
-+
-+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
-+#define vorr_u32 vorr_s32
-+
-+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
-+#define vorr_u64 vorr_s64
-+
-+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-+#define vorrq_s8 _mm_or_si128
-+
-+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-+#define vorrq_s16 _mm_or_si128
-+
-+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-+#define vorrq_s32 _mm_or_si128
-+
-+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-+#define vorrq_s64 _mm_or_si128
-+
-+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-+#define vorrq_u8 _mm_or_si128
-+
-+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-+#define vorrq_u16 _mm_or_si128
-+
-+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-+#define vorrq_u32 _mm_or_si128
-+
-+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
-+#define vorrq_u64 _mm_or_si128
-+
-+//************* Bitwise exclusive or (EOR or XOR) ******************
-+//*******************************************************************
-+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
-+#define veor_s16 veor_s8
-+
-+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
-+#define veor_s32 veor_s8
-+
-+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
-+#define veor_u8 veor_s8
-+
-+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
-+#define veor_u16 veor_s16
-+
-+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
-+#define veor_u32 veor_s32
-+
-+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
-+#define veor_u64 veor_s64
-+
-+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-+#define veorq_s8 _mm_xor_si128
-+
-+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-+#define veorq_s16 _mm_xor_si128
-+
-+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-+#define veorq_s32 _mm_xor_si128
-+
-+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-+#define veorq_s64 _mm_xor_si128
-+
-+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-+#define veorq_u8 _mm_xor_si128
-+
-+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-+#define veorq_u16 _mm_xor_si128
-+
-+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-+#define veorq_u32 _mm_xor_si128
-+
-+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
-+#define veorq_u64 _mm_xor_si128
-+
-+//********************** Bit Clear **********************************
-+//*******************************************************************
-+//Logical AND complement (AND negation or AND NOT)
-+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
-+}
-+
-+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
-+#define vbic_s16 vbic_s8
-+
-+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
-+#define vbic_s32 vbic_s8
-+
-+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
-+#define vbic_u8 vbic_s8
-+
-+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
-+#define vbic_u16 vbic_s16
-+
-+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
-+#define vbic_u32 vbic_s32
-+
-+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
-+#define vbic_u64 vbic_s64
-+
-+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+//**************** Bitwise OR complement ********************************
-+//**************************************** ********************************
-+//no exact IA 32 match, need to implement it as following
-+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vornq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vornq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vornq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
-+#define vorn_u8 vorn_s8
-+
-+
-+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
-+#define vorn_u16 vorn_s16
-+
-+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
-+#define vorn_u32 vorn_s32
-+
-+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
-+#define vorn_u64 vorn_s64
-+
-+
-+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s8( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s16( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s32( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
-+{
-+ __m128i c1, b1;
-+ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
-+ b1 = _mm_andnot_si128 (b, c1);
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_u8( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s16( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_u32( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
-+#define vornq_u64 vornq_s64
-+
-+//********************* Bitwise Select *****************************
-+//******************************************************************
-+//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????)
-+
-+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
-+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
-+
-+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
-+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
-+
-+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
-+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
-+
-+//VBSL only is implemented for SIMD
-+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
-+ return64(res);
-+}
-+
-+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_s16 vbsl_s8
-+
-+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
-+#define vbsl_s32 vbsl_s8
-+
-+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
-+#define vbsl_u8 vbsl_s8
-+
-+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_u16 vbsl_s8
-+
-+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
-+#define vbsl_u32 vbsl_s8
-+
-+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
-+#define vbsl_u64 vbsl_s64
-+
-+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ __m128 sel1, sel2;
-+ __m64_128 res64;
-+ sel1 = _mm_and_ps (_pM128(a), _pM128(b));
-+ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
-+ sel1 = _mm_or_ps (sel1, sel2);
-+ _M64f(res64, sel1);
-+ return res64;
-+}
-+
-+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
-+#define vbsl_p8 vbsl_s8
-+
-+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_p16 vbsl_s8
-+
-+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
-+{
-+ __m128i sel1, sel2;
-+ sel1 = _mm_and_si128 (a, b);
-+ sel2 = _mm_andnot_si128 (a, c);
-+ return _mm_or_si128 (sel1, sel2);
-+}
-+
-+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_s16 vbslq_s8
-+
-+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-+#define vbslq_s32 vbslq_s8
-+
-+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-+#define vbslq_s64 vbslq_s8
-+
-+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-+#define vbslq_u8 vbslq_s8
-+
-+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_u16 vbslq_s8
-+
-+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-+#define vbslq_u32 vbslq_s8
-+
-+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-+#define vbslq_u64 vbslq_s8
-+
-+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
-+{
-+ __m128 sel1, sel2;
-+ sel1 = _mm_and_ps (*(__m128*)&a, b);
-+ sel2 = _mm_andnot_ps (*(__m128*)&a, c);
-+ return _mm_or_ps (sel1, sel2);
-+}
-+
-+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-+#define vbslq_p8 vbslq_u8
-+
-+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_p16 vbslq_s8
-+
-+//************************************************************************************
-+//**************** Transposition operations ****************************************
-+//************************************************************************************
-+//***************** Vector Transpose ************************************************
-+//************************************************************************************
-+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
-+// making the results look like (a0, b0, a2, b2, a4, b4,....) and (a1, b1, a3, b3, a5, b5,.....)
-+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
-+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
-+ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
-+ return val;
-+}
-+
-+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
-+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
-+ return val;
-+}
-+
-+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
-+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
-+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
-+ return val;
-+}
-+
-+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
-+#define vtrn_u8 vtrn_s8
-+
-+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
-+#define vtrn_u16 vtrn_s16
-+
-+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
-+#define vtrn_u32 vtrn_s32
-+
-+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
-+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2x2_t val;
-+ val.val[0].m64_f32[0] = a.m64_f32[0];
-+ val.val[0].m64_f32[1] = b.m64_f32[0];
-+ val.val[1].m64_f32[0] = a.m64_f32[1];
-+ val.val[1].m64_f32[1] = b.m64_f32[1];
-+ return val; //a0,b0,a1,b1
-+}
-+
-+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
-+#define vtrn_p8 vtrn_u8
-+
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
-+#define vtrn_p16 vtrn_s16
-+
-+//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
-+{
-+ int8x16x2_t r8x16;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
-+
-+ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
-+ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
-+ return r8x16;
-+}
-+
-+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
-+{
-+ int16x8x2_t v16x8;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
-+ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
-+ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
-+ return v16x8;
-+}
-+
-+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
-+{
-+    //may not be an optimal solution compared with a serial implementation
-+ int32x4x2_t v32x4;
-+ __m128i a_sh, b_sh;
-+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
-+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
-+
-+ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
-+ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
-+ return v32x4;
-+}
-+
-+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-+#define vtrnq_u8 vtrnq_s8
-+
-+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-+#define vtrnq_u16 vtrnq_s16
-+
-+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-+#define vtrnq_u32 vtrnq_s32
-+
-+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
-+{
-+    //may not be an optimal solution compared with a serial implementation
-+ float32x4x2_t f32x4;
-+ __m128 a_sh, b_sh;
-+    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
-+    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
-+
-+ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
-+ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
-+ return f32x4;
-+}
-+
-+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-+#define vtrnq_p8 vtrnq_s8
-+
-+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
-+#define vtrnq_p16 vtrnq_s16
-+
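The q-register VTRN emulation above works by shuffling each operand into even/odd element order and then unpacking. A scalar reference of the intended 2x2-transpose lane layout, written only to illustrate the result described in the comments (not code from the patched header):

#include <stdint.h>
#include <stdio.h>

/* Reference VTRN for two 8-lane vectors: r0 holds the even-indexed pairs
   (a0,b0,a2,b2,...), r1 the odd-indexed pairs (a1,b1,a3,b3,...). */
static void vtrn8_ref(const int8_t a[8], const int8_t b[8],
                      int8_t r0[8], int8_t r1[8])
{
    for (int i = 0; i < 4; i++) {
        r0[2 * i]     = a[2 * i];
        r0[2 * i + 1] = b[2 * i];
        r1[2 * i]     = a[2 * i + 1];
        r1[2 * i + 1] = b[2 * i + 1];
    }
}

int main(void)
{
    int8_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int8_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int8_t r0[8], r1[8];
    vtrn8_ref(a, b, r0, r1);
    /* r0: 0 10 2 12 4 14 6 16   r1: 1 11 3 13 5 15 7 17 */
    for (int i = 0; i < 8; i++)
        printf("%d %d\n", r0[i], r1[i]);
    return 0;
}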
-+//***************** Interleave elements ***************************
-+//*****************************************************************
-+//output has (a0,b0,a1,b1, a2,b2,.....)
-+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
-+ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
-+#define vzip_s32 vtrn_s32
-+
-+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
-+#define vzip_u8 vzip_s8
-+
-+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
-+#define vzip_u16 vzip_s16
-+
-+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
-+#define vzip_u32 vzip_s32
-+
-+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
-+#define vzip_f32 vtrn_f32
-+
-+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
-+#define vzip_p8 vzip_u8
-+
-+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
-+#define vzip_p16 vzip_u16
-+
-+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
-+{
-+ int8x16x2_t r8x16;
-+ r8x16.val[0] = _mm_unpacklo_epi8(a, b);
-+ r8x16.val[1] = _mm_unpackhi_epi8(a, b);
-+ return r8x16;
-+}
-+
-+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
-+{
-+ int16x8x2_t r16x8;
-+ r16x8.val[0] = _mm_unpacklo_epi16(a, b);
-+ r16x8.val[1] = _mm_unpackhi_epi16(a, b);
-+ return r16x8;
-+}
-+
-+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
-+{
-+ int32x4x2_t r32x4;
-+ r32x4.val[0] = _mm_unpacklo_epi32(a, b);
-+ r32x4.val[1] = _mm_unpackhi_epi32(a, b);
-+ return r32x4;
-+}
-+
-+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-+#define vzipq_u8 vzipq_s8
-+
-+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-+#define vzipq_u16 vzipq_s16
-+
-+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-+#define vzipq_u32 vzipq_s32
-+
-+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
-+{
-+ float32x4x2_t f32x4;
-+ f32x4.val[0] = _mm_unpacklo_ps ( a, b);
-+ f32x4.val[1] = _mm_unpackhi_ps ( a, b);
-+ return f32x4;
-+}
-+
-+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-+#define vzipq_p8 vzipq_u8
-+
-+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
-+#define vzipq_p16 vzipq_u16
-+
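As the comment above notes, interleaving is exactly what the SSE2 unpack instructions do, so the q-register zip forms map one-to-one onto them. A standalone sketch (compiles on any x86 target with SSE2; the buffer names and values are illustrative):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int8_t a[16], b[16], lo[16], hi[16];
    for (int i = 0; i < 16; i++) { a[i] = (int8_t)i; b[i] = (int8_t)(100 + i); }

    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);
    /* val[0] of vzipq_s8: low halves interleaved; val[1]: high halves. */
    _mm_storeu_si128((__m128i *)lo, _mm_unpacklo_epi8(va, vb));
    _mm_storeu_si128((__m128i *)hi, _mm_unpackhi_epi8(va, vb));

    /* lo: 0 100 1 101 ... 7 107   hi: 8 108 9 109 ... 15 115 */
    for (int i = 0; i < 16; i++)
        printf("%d %d\n", lo[i], hi[i]);
    return 0;
}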
-+//*********************** De-Interleave elements *************************
-+//*************************************************************************
-+//As a result of these functions, the first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val contains (a1,a3,a5,....b1,b3,b5...)
-+//no such functions in IA32 SIMD, shuffle is required
-+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
-+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
-+ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
-+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
-+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
-+{
-+ int32x2x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
-+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
-+#define vuzp_u8 vuzp_s8
-+
-+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
-+#define vuzp_u16 vuzp_s16
-+
-+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
-+#define vuzp_u32 vuzp_s32
-+
-+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
-+#define vuzp_f32 vzip_f32
-+
-+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
-+#define vuzp_p8 vuzp_u8
-+
-+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
-+#define vuzp_p16 vuzp_u16
-+
-+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
-+{
-+ int8x16x2_t v8x16;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
-+ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
-+ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
-+ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
-+ return v8x16;
-+}
-+
-+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
-+{
-+ int16x8x2_t v16x8;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
-+ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
-+ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
-+ return v16x8;
-+}
-+
-+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
-+{
-+    //may not be an optimal solution compared with a serial implementation
-+ int32x4x2_t v32x4;
-+ __m128i a_sh, b_sh;
-+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
-+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
-+
-+ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
-+ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
-+ return v32x4;
-+}
-+
-+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-+#define vuzpq_u8 vuzpq_s8
-+
-+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-+#define vuzpq_u16 vuzpq_s16
-+
-+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-+#define vuzpq_u32 vuzpq_s32
-+
-+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
-+{
-+ float32x4x2_t v32x4;
-+    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
-+    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
-+ return v32x4;
-+}
-+
-+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-+#define vuzpq_p8 vuzpq_u8
-+
-+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
-+#define vuzpq_p16 vuzpq_u16
-+
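A quick property worth noting about the two sections above: VUZP undoes VZIP. The scalar round-trip below is only a sanity-check sketch of that relationship, not code from the header:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void zip8(const int8_t a[8], const int8_t b[8], int8_t out[16])
{
    for (int i = 0; i < 8; i++) { out[2 * i] = a[i]; out[2 * i + 1] = b[i]; }
}

static void uzp8(const int8_t in[16], int8_t even[8], int8_t odd[8])
{
    for (int i = 0; i < 8; i++) { even[i] = in[2 * i]; odd[i] = in[2 * i + 1]; }
}

int main(void)
{
    int8_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int8_t b[8] = {9, 10, 11, 12, 13, 14, 15, 16};
    int8_t zipped[16], a2[8], b2[8];
    zip8(a, b, zipped);          /* interleave, as VZIP does */
    uzp8(zipped, a2, b2);        /* de-interleave, as VUZP does */
    assert(memcmp(a, a2, 8) == 0 && memcmp(b, b2, 8) == 0);
    return 0;
}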
-+//##############################################################################################
-+//*********************** Reinterpret cast intrinsics.******************************************
-+//##############################################################################################
-+// Not a part of the official NEON instruction set but available in the gcc compiler *********************
-+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
-+#define vreinterpret_p8_u32
-+
-+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
-+#define vreinterpret_p8_u16
-+
-+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
-+#define vreinterpret_p8_u8
-+
-+poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
-+#define vreinterpret_p8_s32
-+
-+poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
-+#define vreinterpret_p8_s16
-+
-+poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
-+#define vreinterpret_p8_s8
-+
-+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
-+#define vreinterpret_p8_u64
-+
-+poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
-+#define vreinterpret_p8_s64
-+
-+poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
-+#define vreinterpret_p8_f32
-+
-+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
-+#define vreinterpret_p8_p16
-+
-+poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
-+#define vreinterpretq_p8_u32
-+
-+poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
-+#define vreinterpretq_p8_u16
-+
-+poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
-+#define vreinterpretq_p8_u8
-+
-+poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
-+#define vreinterpretq_p8_s32
-+
-+poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
-+#define vreinterpretq_p8_s16
-+
-+poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
-+#define vreinterpretq_p8_s8
-+
-+poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
-+#define vreinterpretq_p8_u64
-+
-+poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
-+#define vreinterpretq_p8_s64
-+
-+poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
-+#define vreinterpretq_p8_f32(t) _M128i(t)
-+
-+poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
-+#define vreinterpretq_p8_p16
-+
-+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
-+#define vreinterpret_p16_u32
-+
-+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
-+#define vreinterpret_p16_u16
-+
-+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
-+#define vreinterpret_p16_u8
-+
-+poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
-+#define vreinterpret_p16_s32
-+
-+poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
-+#define vreinterpret_p16_s16
-+
-+poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
-+#define vreinterpret_p16_s8
-+
-+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
-+#define vreinterpret_p16_u64
-+
-+poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
-+#define vreinterpret_p16_s64
-+
-+poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
-+#define vreinterpret_p16_f32
-+
-+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
-+#define vreinterpret_p16_p8
-+
-+poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
-+#define vreinterpretq_p16_u32
-+
-+poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
-+#define vreinterpretq_p16_u16
-+
-+poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
-+#define vreinterpretq_p16_s32
-+
-+poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
-+#define vreinterpretq_p16_s16
-+
-+poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
-+#define vreinterpretq_p16_s8
-+
-+poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
-+#define vreinterpretq_p16_u64
-+
-+poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
-+#define vreinterpretq_p16_s64
-+
-+poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
-+#define vreinterpretq_p16_f32(t) _M128i(t)
-+
-+poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
-+#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
-+
-+//**** Integer to float ******
-+float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
-+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
-+
-+
-+float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
-+#define vreinterpret_f32_u16 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
-+#define vreinterpret_f32_u8 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s32 (int32x2_t t);
-+#define vreinterpret_f32_s32 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s16 (int16x4_t t);
-+#define vreinterpret_f32_s16 vreinterpret_f32_u32
-+
-+float32x2_t vreinterpret_f32_s8 (int8x8_t t);
-+#define vreinterpret_f32_s8 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_u64(uint64x1_t t);
-+#define vreinterpret_f32_u64 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s64 (int64x1_t t);
-+#define vreinterpret_f32_s64 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
-+#define vreinterpret_f32_p16 vreinterpret_f32_u32
-+
-+float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
-+#define vreinterpret_f32_p8 vreinterpret_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
-+#define vreinterpretq_f32_u32(t) *(__m128*)&(t)
-+
-+float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
-+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
-+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
-+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
-+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
-+#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
-+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
-+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
-+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
-+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
-+
-+//*** Integer type conversions ******************
-+//no conversion necessary for the following functions because it is same data type
-+int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
-+#define vreinterpret_s64_u32
-+
-+int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
-+#define vreinterpret_s64_u16
-+
-+int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
-+#define vreinterpret_s64_u8
-+
-+int64x1_t vreinterpret_s64_s32 (int32x2_t t);
-+#define vreinterpret_s64_s32
-+
-+int64x1_t vreinterpret_s64_s16 (int16x4_t t);
-+#define vreinterpret_s64_s16
-+
-+int64x1_t vreinterpret_s64_s8 (int8x8_t t);
-+#define vreinterpret_s64_s8
-+
-+int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
-+#define vreinterpret_s64_u64
-+
-+int64x1_t vreinterpret_s64_f32 (float32x2_t t);
-+#define vreinterpret_s64_f32
-+
-+int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
-+#define vreinterpret_s64_p16
-+
-+int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
-+#define vreinterpret_s64_p8
-+
-+int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
-+#define vreinterpretq_s64_u32
-+
-+int64x2_t vreinterpretq_s64_s16 (uint16x8_t t);
-+#define vreinterpretq_s64_s16
-+
-+int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
-+#define vreinterpretq_s64_u8
-+
-+int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
-+#define vreinterpretq_s64_s32
-+
-+int64x2_t vreinterpretq_s64_u16 (int16x8_t t);
-+#define vreinterpretq_s64_u16
-+
-+int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
-+#define vreinterpretq_s64_s8
-+
-+int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
-+#define vreinterpretq_s64_u64
-+
-+int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
-+#define vreinterpretq_s64_f32(t) _M128i(t)
-+
-+int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
-+#define vreinterpretq_s64_p16
-+
-+int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
-+#define vreinterpretq_s64_p8
-+
-+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
-+#define vreinterpret_u64_u32
-+
-+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
-+#define vreinterpret_u64_u16
-+
-+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
-+#define vreinterpret_u64_u8
-+
-+uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
-+#define vreinterpret_u64_s32
-+
-+uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
-+#define vreinterpret_u64_s16
-+
-+uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
-+#define vreinterpret_u64_s8
-+
-+uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
-+#define vreinterpret_u64_s64
-+
-+uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
-+#define vreinterpret_u64_f32
-+
-+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
-+#define vreinterpret_u64_p16
-+
-+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
-+#define vreinterpret_u64_p8
-+
-+uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
-+#define vreinterpretq_u64_u32
-+
-+uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
-+#define vreinterpretq_u64_u16
-+
-+uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
-+#define vreinterpretq_u64_u8
-+
-+uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
-+#define vreinterpretq_u64_s32
-+
-+uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
-+#define vreinterpretq_u64_s16
-+
-+uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
-+#define vreinterpretq_u64_s8
-+
-+uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
-+#define vreinterpretq_u64_s64
-+
-+uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
-+#define vreinterpretq_u64_f32(t) _M128i(t)
-+
-+uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
-+#define vreinterpretq_u64_p16
-+
-+uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
-+#define vreinterpretq_u64_p8
-+
-+int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
-+#define vreinterpret_s8_u32
-+
-+int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
-+#define vreinterpret_s8_u16
-+
-+int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
-+#define vreinterpret_s8_u8
-+
-+int8x8_t vreinterpret_s8_s32 (int32x2_t t);
-+#define vreinterpret_s8_s32
-+
-+int8x8_t vreinterpret_s8_s16 (int16x4_t t);
-+#define vreinterpret_s8_s16
-+
-+int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
-+#define vreinterpret_s8_u64
-+
-+int8x8_t vreinterpret_s8_s64 (int64x1_t t);
-+#define vreinterpret_s8_s64
-+
-+int8x8_t vreinterpret_s8_f32 (float32x2_t t);
-+#define vreinterpret_s8_f32
-+
-+int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
-+#define vreinterpret_s8_p16
-+
-+int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
-+#define vreinterpret_s8_p8
-+
-+int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
-+#define vreinterpretq_s8_u32
-+
-+int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
-+#define vreinterpretq_s8_u16
-+
-+int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
-+#define vreinterpretq_s8_u8
-+
-+int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
-+#define vreinterpretq_s8_s32
-+
-+int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
-+#define vreinterpretq_s8_s16
-+
-+int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
-+#define vreinterpretq_s8_u64
-+
-+int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
-+#define vreinterpretq_s8_s64
-+
-+int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
-+#define vreinterpretq_s8_f32(t) _M128i(t)
-+
-+int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
-+#define vreinterpretq_s8_p16
-+
-+int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
-+#define vreinterpretq_s8_p8
-+
-+int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
-+#define vreinterpret_s16_u32
-+
-+int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
-+#define vreinterpret_s16_u16
-+
-+int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
-+#define vreinterpret_s16_u8
-+
-+int16x4_t vreinterpret_s16_s32 (int32x2_t t);
-+#define vreinterpret_s16_s32
-+
-+int16x4_t vreinterpret_s16_s8 (int8x8_t t);
-+#define vreinterpret_s16_s8
-+
-+int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
-+#define vreinterpret_s16_u64
-+
-+int16x4_t vreinterpret_s16_s64 (int64x1_t t);
-+#define vreinterpret_s16_s64
-+
-+int16x4_t vreinterpret_s16_f32 (float32x2_t t);
-+#define vreinterpret_s16_f32
-+
-+
-+int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
-+#define vreinterpret_s16_p16
-+
-+int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
-+#define vreinterpret_s16_p8
-+
-+int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
-+#define vreinterpretq_s16_u32
-+
-+int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
-+#define vreinterpretq_s16_u16
-+
-+int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
-+#define vreinterpretq_s16_u8
-+
-+int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
-+#define vreinterpretq_s16_s32
-+
-+int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
-+#define vreinterpretq_s16_s8
-+
-+int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
-+#define vreinterpretq_s16_u64
-+
-+int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
-+#define vreinterpretq_s16_s64
-+
-+int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
-+#define vreinterpretq_s16_f32(t) _M128i(t)
-+
-+int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
-+#define vreinterpretq_s16_p16
-+
-+int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
-+#define vreinterpretq_s16_p8
-+
-+int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
-+#define vreinterpret_s32_u32
-+
-+int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
-+#define vreinterpret_s32_u16
-+
-+int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
-+#define vreinterpret_s32_u8
-+
-+int32x2_t vreinterpret_s32_s16 (int16x4_t t);
-+#define vreinterpret_s32_s16
-+
-+int32x2_t vreinterpret_s32_s8 (int8x8_t t);
-+#define vreinterpret_s32_s8
-+
-+int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
-+#define vreinterpret_s32_u64
-+
-+int32x2_t vreinterpret_s32_s64 (int64x1_t t);
-+#define vreinterpret_s32_s64
-+
-+int32x2_t vreinterpret_s32_f32 (float32x2_t t);
-+#define vreinterpret_s32_f32
-+
-+int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
-+#define vreinterpret_s32_p16
-+
-+int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
-+#define vreinterpret_s32_p8
-+
-+int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
-+#define vreinterpretq_s32_u32
-+
-+int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
-+#define vreinterpretq_s32_u16
-+
-+int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
-+#define vreinterpretq_s32_u8
-+
-+int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
-+#define vreinterpretq_s32_s16
-+
-+int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
-+#define vreinterpretq_s32_s8
-+
-+int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
-+#define vreinterpretq_s32_u64
-+
-+int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
-+#define vreinterpretq_s32_s64
-+
-+int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
-+#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
-+
-+int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
-+#define vreinterpretq_s32_p16
-+
-+int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
-+#define vreinterpretq_s32_p8
-+
-+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
-+#define vreinterpret_u8_u32
-+
-+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
-+#define vreinterpret_u8_u16
-+
-+uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
-+#define vreinterpret_u8_s32
-+
-+uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
-+#define vreinterpret_u8_s16
-+
-+uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
-+#define vreinterpret_u8_s8
-+
-+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
-+#define vreinterpret_u8_u64
-+
-+uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
-+#define vreinterpret_u8_s64
-+
-+uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
-+#define vreinterpret_u8_f32
-+
-+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
-+#define vreinterpret_u8_p16
-+
-+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
-+#define vreinterpret_u8_p8
-+
-+uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
-+#define vreinterpretq_u8_u32
-+
-+uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
-+#define vreinterpretq_u8_u16
-+
-+uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
-+#define vreinterpretq_u8_s32
-+
-+uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
-+#define vreinterpretq_u8_s16
-+
-+uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
-+#define vreinterpretq_u8_s8
-+
-+uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
-+#define vreinterpretq_u8_u64
-+
-+uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
-+#define vreinterpretq_u8_s64
-+
-+uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
-+#define vreinterpretq_u8_f32(t) _M128i(t)
-+
-+
-+uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
-+#define vreinterpretq_u8_p16
-+
-+uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
-+#define vreinterpretq_u8_p8
-+
-+uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
-+#define vreinterpret_u16_u32
-+
-+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
-+#define vreinterpret_u16_u8
-+
-+uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
-+#define vreinterpret_u16_s32
-+
-+uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
-+#define vreinterpret_u16_s16
-+
-+uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
-+#define vreinterpret_u16_s8
-+
-+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
-+#define vreinterpret_u16_u64
-+
-+uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
-+#define vreinterpret_u16_s64
-+
-+uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
-+#define vreinterpret_u16_f32
-+
-+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
-+#define vreinterpret_u16_p16
-+
-+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
-+#define vreinterpret_u16_p8
-+
-+uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
-+#define vreinterpretq_u16_u32
-+
-+uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
-+#define vreinterpretq_u16_u8
-+
-+uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
-+#define vreinterpretq_u16_s32
-+
-+uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
-+#define vreinterpretq_u16_s16
-+
-+uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
-+#define vreinterpretq_u16_s8
-+
-+uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
-+#define vreinterpretq_u16_u64
-+
-+uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
-+#define vreinterpretq_u16_s64
-+
-+uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
-+#define vreinterpretq_u16_f32(t) _M128i(t)
-+
-+uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
-+#define vreinterpretq_u16_p16
-+
-+uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
-+#define vreinterpretq_u16_p8
-+
-+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
-+#define vreinterpret_u32_u16
-+
-+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
-+#define vreinterpret_u32_u8
-+
-+uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
-+#define vreinterpret_u32_s32
-+
-+uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
-+#define vreinterpret_u32_s16
-+
-+uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
-+#define vreinterpret_u32_s8
-+
-+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
-+#define vreinterpret_u32_u64
-+
-+uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
-+#define vreinterpret_u32_s64
-+
-+uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
-+#define vreinterpret_u32_f32
-+
-+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
-+#define vreinterpret_u32_p16
-+
-+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
-+#define vreinterpret_u32_p8
-+
-+uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
-+#define vreinterpretq_u32_u16
-+
-+uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
-+#define vreinterpretq_u32_u8
-+
-+uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
-+#define vreinterpretq_u32_s32
-+
-+uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
-+#define vreinterpretq_u32_s16
-+
-+uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
-+#define vreinterpretq_u32_s8
-+
-+uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
-+#define vreinterpretq_u32_u64
-+
-+uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
-+#define vreinterpretq_u32_s64
-+
-+uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
-+#define vreinterpretq_u32_f32(t) _M128i(t)
-+
-+uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
-+#define vreinterpretq_u32_p16
-+
-+uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
-+#define vreinterpretq_u32_p8
-+
-+#endif /* NEON2SSE_H */
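Most of the vreinterpret macros in the block above expand to nothing because the integer vector types share a single underlying representation; the float crossings use a pointer cast (d registers) or a cast intrinsic (q registers), neither of which emits code. A small standalone illustration of the q-register case (SSE2; not taken from the header):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 f = _mm_set1_ps(1.0f);          /* each lane holds the bits 0x3f800000 */
    __m128i i = _mm_castps_si128(f);       /* same bits, integer view, no code emitted */
    unsigned lane0 = (unsigned)_mm_cvtsi128_si32(i);
    printf("0x%08x\n", lane0);             /* prints 0x3f800000 */
    return 0;
}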
-diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
-index fee33a3..22fb2ce 100644
---- a/gcc/config/i386/gnu-user.h
-+++ b/gcc/config/i386/gnu-user.h
-@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see
- When the -shared link option is used a final link is not being
- done. */
-
-+#undef ANDROID_TARGET_CC1_SPEC
-+#define ANDROID_TARGET_CC1_SPEC \
-+ " -mssse3 -fno-short-enums " \
-+
- #undef ASM_SPEC
- #define ASM_SPEC \
-- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
-+ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
-
- #undef SUBTARGET_EXTRA_SPECS
- #define SUBTARGET_EXTRA_SPECS \
-diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
-index 7a02a7e..cac4179 100644
---- a/gcc/config/i386/gnu-user64.h
-+++ b/gcc/config/i386/gnu-user64.h
-@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define SPEC_X32 "mx32"
- #endif
-
-+#undef ANDROID_TARGET_CC1_SPEC
-+#define ANDROID_TARGET_CC1_SPEC \
-+ "%{m32:-mssse3 -fno-short-enums}" \
-+ "%{!m32:-msse4.2 -mpopcnt}"
-+
- #undef ASM_SPEC
- #define ASM_SPEC "%{" SPEC_32 ":--32} \
- %{" SPEC_64 ":--64} \
-diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
-index 3d044e8..5c89fca 100644
---- a/gcc/config/i386/i386.c
-+++ b/gcc/config/i386/i386.c
-@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp)
- else if (!SYMBOL_REF_FAR_ADDR_P (op0)
- && (SYMBOL_REF_LOCAL_P (op0)
- || (HAVE_LD_PIE_COPYRELOC
-+ && !TARGET_HAS_BIONIC
- && flag_pie
- && !SYMBOL_REF_WEAK (op0)
- && !SYMBOL_REF_FUNCTION_P (op0)))
-diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
-index 4b9910f..c33e074 100644
---- a/gcc/config/i386/linux-common.h
-+++ b/gcc/config/i386/linux-common.h
-@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see
- #undef CC1_SPEC
- #define CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
-+ GNU_USER_TARGET_CC1_SPEC \
-+ ANDROID_TARGET_CC1_SPEC \
-+ " " \
-+ ANDROID_CC1_SPEC("-fPIC"))
-+
-+#define CC1PLUS_SPEC \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-
- #undef LINK_SPEC
- #define LINK_SPEC \
-diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h
-index a1f98d3..3725799 100644
---- a/gcc/config/i386/pmm_malloc.h
-+++ b/gcc/config/i386/pmm_malloc.h
-@@ -31,7 +31,7 @@
- #ifndef __cplusplus
- extern int posix_memalign (void **, size_t, size_t);
- #else
--extern "C" int posix_memalign (void **, size_t, size_t) throw ();
-+extern "C" int posix_memalign (void **, size_t, size_t);
- #endif
-
- static __inline void *
-diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h
-index 301a41c..bebfe7f 100644
---- a/gcc/config/linux-android.h
-+++ b/gcc/config/linux-android.h
-@@ -38,15 +38,18 @@
- "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}"
-
- #define ANDROID_LINK_SPEC \
-- "%{shared: -Bsymbolic}"
-+ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now"
-
--#define ANDROID_CC1_SPEC \
-+#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
- "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
-- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}"
-+ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"
-
- #define ANDROID_CC1PLUS_SPEC \
-- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \
-- "%{!frtti:%{!fno-rtti: -fno-rtti}}"
-+ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \
-+ "%{!frtti:%{!fno-rtti: -frtti}}"
-+
-+#define ANDROID_ASM_SPEC \
-+ "--noexecstack"
-
- #define ANDROID_LIB_SPEC \
- "%{!static: -ldl}"
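The notable change in this hunk is that ANDROID_CC1_SPEC becomes a function-like macro taking the PIC default, so each target header chooses its own flag (the i386 header earlier in this patch passes -fPIC; the MIPS and AArch64 headers pass -fpic). A reduced C sketch of that parameterization, reusing the spec strings from the hunk but none of the real gcc driver machinery:

#include <stdio.h>

/* Sketch only: the real macro lives in gcc/config/linux-android.h and is
   consumed by the driver's spec language, not by ordinary C code. */
#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
  "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
  "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"

static const char cc1_spec_x86[]  = ANDROID_CC1_SPEC("-fPIC"); /* i386 linux-common.h */
static const char cc1_spec_mips[] = ANDROID_CC1_SPEC("-fpic"); /* mips linux-common.h */

int main(void)
{
    puts(cc1_spec_x86);
    puts(cc1_spec_mips);
    return 0;
}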
-diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
-new file mode 100644
-index 0000000..32c539c
---- /dev/null
-+++ b/gcc/config/mips/android.h
-@@ -0,0 +1,49 @@
-+/* Target macros for mips*-*android* targets.
-+ Copyright (C) 2014 Free Software Foundation, Inc.
-+
-+This file is part of GCC.
-+
-+GCC is free software; you can redistribute it and/or modify
-+it under the terms of the GNU General Public License as published by
-+the Free Software Foundation; either version 3, or (at your option)
-+any later version.
-+
-+GCC is distributed in the hope that it will be useful,
-+but WITHOUT ANY WARRANTY; without even the implied warranty of
-+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+GNU General Public License for more details.
-+
-+You should have received a copy of the GNU General Public License
-+along with GCC; see the file COPYING3. If not see
-+<http://www.gnu.org/licenses/>. */
-+
-+#undef DRIVER_SELF_SPECS
-+#define DRIVER_SELF_SPECS \
-+ /* Make sure a -mips option is present. This helps us to pick \
-+ the right multilib, and also makes the later specs easier \
-+ to write. */ \
-+ MIPS_ISA_LEVEL_SPEC, \
-+ \
-+ /* Infer the default float setting from -march. */ \
-+ MIPS_ARCH_FLOAT_SPEC, \
-+ \
-+ /* Infer the -msynci setting from -march if not explicitly set. */ \
-+ MIPS_ISA_SYNCI_SPEC, \
-+ \
-+ /* If no ABI option is specified, infer one from the ISA level \
-+ or -mgp setting. */ \
-+ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
-+ \
-+ /* If no FP ABI option is specified, infer one from the \
-+ ABI/ISA level unless there is a conflicting option. */ \
-+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
-+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
-+ \
-+ /* If no odd-spreg option is specified, infer one from the ISA. */ \
-+ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
-+ \
-+ /* Base SPECs. */ \
-+ BASE_DRIVER_SELF_SPECS, \
-+ \
-+ /* Use the standard linux specs for everything else. */ \
-+ LINUX_DRIVER_SELF_SPECS
-diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
-index 15b549c..4a28160 100644
---- a/gcc/config/mips/gnu-user.h
-+++ b/gcc/config/mips/gnu-user.h
-@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
- /* The GNU C++ standard library requires this. */ \
- if (c_dialect_cxx ()) \
- builtin_define ("_GNU_SOURCE"); \
-+ ANDROID_TARGET_OS_CPP_BUILTINS(); \
- } while (0)
-
- #undef SUBTARGET_CPP_SPEC
-@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see
-
- #undef SUBTARGET_ASM_SPEC
- #define SUBTARGET_ASM_SPEC \
-- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}"
-+ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
-
- /* The MIPS assembler has different syntax for .set. We set it to
- .dummy to trap any errors. */
-@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
- #endif
-
- #define LINUX_DRIVER_SELF_SPECS \
-- NO_SHARED_SPECS \
-+ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \
- MARCH_MTUNE_NATIVE_SPECS, \
- /* -mplt has no effect without -mno-shared. Simplify later \
- specs handling by removing a redundant option. */ \
-diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
-index 8429a7c..4b0825d 100644
---- a/gcc/config/mips/linux-common.h
-+++ b/gcc/config/mips/linux-common.h
-@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see
- #undef SUBTARGET_CC1_SPEC
- #define SUBTARGET_CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
-+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
-
- #undef CC1PLUS_SPEC
- #define CC1PLUS_SPEC \
-@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see
-
- /* The default value isn't sufficient in 64-bit mode. */
- #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024)
-+
-+#ifdef IN_LIBGCC2
-+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
-+#endif
-diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
-new file mode 100644
-index 0000000..39f512c
---- /dev/null
-+++ b/gcc/config/mips/t-linux-android
-@@ -0,0 +1,3 @@
-+MULTILIB_OPTIONS = mips32r2/mips32r6
-+MULTILIB_DIRNAMES = mips-r2 mips-r6
-+MULTILIB_OSDIRNAMES = ../libr2 ../libr6
-diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
-new file mode 100644
-index 0000000..55cab7d
---- /dev/null
-+++ b/gcc/config/mips/t-linux-android64
-@@ -0,0 +1,4 @@
-+MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
-+MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
-+MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
-+MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
-diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h
-index 37ecfc4..a5f1b99 100644
---- a/gcc/config/openbsd.h
-+++ b/gcc/config/openbsd.h
-@@ -136,8 +136,12 @@ while (0)
- #define LIB_SPEC OBSD_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #undef LIB_SPEC
- #define LIB_SPEC OBSD_LIB_SPEC
-diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
-index cbf9097..eb2217f 100644
---- a/gcc/config/rs6000/sysv4.h
-+++ b/gcc/config/rs6000/sysv4.h
-@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
- -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}"
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
--# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+# ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+# define LINK_EH_SPEC "--eh-frame-hdr "
-+# else
-+# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+# endif
- #endif
-
- #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \
-diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
-index 5160e1f..7632a50 100644
---- a/gcc/config/sol2.h
-+++ b/gcc/config/sol2.h
-@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see
- /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs
- --eh-frame-hdr to create the required .eh_frame_hdr sections. */
- #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+#endif
- #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
- #endif
-
-diff --git a/gcc/configure b/gcc/configure
-index 1c6e340..28ad050 100755
---- a/gcc/configure
-+++ b/gcc/configure
-@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769
- enable_fix_cortex_a53_843419
- with_glibc_version
- enable_gnu_unique_object
-+enable_eh_frame_hdr_for_static
- enable_linker_build_id
- enable_default_ssp
- with_long_double_128
-@@ -1670,6 +1671,9 @@ Optional Features:
- --enable-gnu-unique-object
- enable the use of the @gnu_unique_object ELF
- extension on glibc systems
-+ --enable-eh-frame-hdr-for-static
-+ enable linker PT_GNU_EH_FRAME support for static
-+ executable
- --enable-linker-build-id
- compiler will always pass --build-id to linker
- --enable-default-ssp enable Stack Smashing Protection as default
-@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
-
- $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h
-
-+ # Check whether --enable-eh-frame-hdr-for-static was given.
-+if test "${enable_eh_frame_hdr_for_static+set}" = set; then :
-+ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in
-+ yes | no) ;;
-+ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid
-+value for --enable-eh-frame-hdr-for-static.
-+Valid choices are 'yes' and 'no'." "$LINENO" 5 ;;
-+ esac
-+else
-+ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
-+# Linux kernel.
-+ if test x$host = x$build -a x$host = x$target &&
-+ ldd --version 2>&1 >/dev/null &&
-+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
-+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
-+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
-+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
-+ if test "$glibcnum" -ge 2003 ; then
-+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
-+ if echo "$auvx" | grep AT_PHDR > /dev/null &&
-+ echo "$auvx" | grep AT_PHNUM > /dev/null; then
-+ enable_eh_frame_hdr_for_static=yes
-+ fi
-+ fi
-+ fi
-+fi
-+
-+ if test x$enable_eh_frame_hdr_for_static = xyes; then
-+
-+$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h
-+
-+ fi
- fi
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5
- $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; }
-diff --git a/gcc/configure.ac b/gcc/configure.ac
-index 6c1dcd9..0cf7419 100644
---- a/gcc/configure.ac
-+++ b/gcc/configure.ac
-@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR])
- if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
- AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1,
- [Define if your linker supports .eh_frame_hdr.])
-+ AC_ARG_ENABLE(eh-frame-hdr-for-static,
-+ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static],
-+ [enable linker PT_GNU_EH_FRAME support for static executable])],
-+ [case $enable_eh_frame_hdr_for_static in
-+ yes | no) ;;
-+ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid
-+value for --enable-eh-frame-hdr-for-static.
-+Valid choices are 'yes' and 'no'.]) ;;
-+ esac],
-+# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
-+# Linux kernel.
-+ [[if test x$host = x$build -a x$host = x$target &&
-+ ldd --version 2>&1 >/dev/null &&
-+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
-+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
-+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
-+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
-+ if test "$glibcnum" -ge 2003 ; then
-+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
-+ if echo "$auvx" | grep AT_PHDR > /dev/null &&
-+ echo "$auvx" | grep AT_PHNUM > /dev/null; then
-+ enable_eh_frame_hdr_for_static=yes
-+ fi
-+ fi
-+ fi]])
-+ if test x$enable_eh_frame_hdr_for_static = xyes; then
-+ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1,
-+[Define if your system supports PT_GNU_EH_FRAME for static executable.])
-+ fi
- fi
- AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr)
-
-diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C
-new file mode 100644
-index 0000000..15408ef
---- /dev/null
-+++ b/gcc/testsuite/g++.dg/eh/spec3-static.C
-@@ -0,0 +1,25 @@
-+// PR c++/4381
-+// Test that exception-specs work properly for classes with virtual bases.
-+
-+// { dg-do run }
-+// { dg-options "-static" }
-+
-+class Base {};
-+
-+struct A : virtual public Base
-+{
-+ A() {}
-+};
-+
-+struct B {};
-+
-+void func() throw (B,A)
-+{
-+ throw A();
-+}
-+
-+int main(void)
-+{
-+ try { func(); }
-+ catch (A& a) { }
-+}
-diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
-index f3343fc..d426477 100644
---- a/libgcc/crtstuff.c
-+++ b/libgcc/crtstuff.c
-@@ -88,7 +88,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(BSD_DL_ITERATE_PHDR_AVAILABLE)
- #include <link.h>
- # define USE_PT_GNU_EH_FRAME
-@@ -97,7 +98,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(__sun__) && defined(__svr4__)
- #include <link.h>
- # define USE_PT_GNU_EH_FRAME
-@@ -106,7 +108,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(__GLIBC__) && __GLIBC__ >= 2
- #include <link.h>
- /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
-@@ -121,7 +124,7 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(CRTSTUFFT_O) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(inhibit_libc) \
- && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__))
- /* On systems using glibc, an inhibit_libc build of libgcc is only
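Taken together, the four crtstuff.c hunks relax the same gate: previously any CRTSTUFFT_O build (the crtbeginT.o used for -static links) lost the PT_GNU_EH_FRAME path, and now it keeps it when --enable-eh-frame-hdr-for-static is configured. A condensed, illustrative sketch of the gate and of what it selects (not the actual crtstuff.c source, and only the two macros relevant to the change are tested):

#include <stdio.h>

#if defined(HAVE_LD_EH_FRAME_HDR) \
    && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O))
# define USE_PT_GNU_EH_FRAME
#endif

int main(void)
{
#ifdef USE_PT_GNU_EH_FRAME
    puts("unwind tables found at run time via PT_GNU_EH_FRAME / dl_iterate_phdr");
#else
    puts("unwind tables registered through the __EH_FRAME_BEGIN__ list");
#endif
    return 0;
}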
-diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
-index 555c0fe..47c8655 100644
---- a/libgcc/gthr-posix.h
-+++ b/libgcc/gthr-posix.h
-@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define __GTHREADS 1
- #define __GTHREADS_CXX0X 1
-
-+/* The following should normally be in a different header file,
-+ * but I couldn't find the right location. The point of the macro
-+ * definition below is to prevent libsupc++ and libstdc++ from referencing
-+ * weak symbols in their static C++ constructors. Such code crashes
-+ * when a shared object linked statically to these libraries is
-+ * loaded on Android 2.1 (Eclair) and older platform releases, due
-+ * to a dynamic linker bug.
-+ */
-+#ifdef __ANDROID__
-+#undef GTHREAD_USE_WEAK
-+#define GTHREAD_USE_WEAK 0
-+#endif
-+
- #include <pthread.h>
-
- #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \
-diff --git a/libgcc/gthr.h b/libgcc/gthr.h
-index 47a7d06..67a680f 100644
---- a/libgcc/gthr.h
-+++ b/libgcc/gthr.h
-@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define GTHREAD_USE_WEAK 1
- #endif
- #endif
-+#if __ANDROID__
-+#include "gthr-posix.h"
-+#else
- #include "gthr-default.h"
-+#endif
-
- #ifndef HIDE_EXPORTS
- #pragma GCC visibility pop
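For context on the two libgcc hunks above: with GTHREAD_USE_WEAK the gthr-posix.h wrappers reference the pthread entry points weakly and test at run time whether a threads library is actually linked; the comment in the patch explains that old Android dynamic linkers crash on that pattern, so it is forced off under __ANDROID__. The fragment below is only a sketch of the pattern being disabled, with an invented wrapper name, not the libgcc code:

#include <pthread.h>
#include <stddef.h>

#ifndef GTHREAD_USE_WEAK
# define GTHREAD_USE_WEAK 1   /* the patch pins this to 0 when __ANDROID__ is set */
#endif

#if GTHREAD_USE_WEAK
# pragma weak pthread_mutex_lock
#endif

/* With weak references, a program that never linked a threads library sees
   the symbol as null and skips locking; with GTHREAD_USE_WEAK == 0 the call
   is unconditional (fine on bionic, where pthreads live in libc). */
static int sketch_mutex_lock(pthread_mutex_t *m)
{
#if GTHREAD_USE_WEAK
    if (pthread_mutex_lock == NULL)
        return 0;
#endif
    return pthread_mutex_lock(m);
}

int main(void)
{
    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return sketch_mutex_lock(&m);
}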
-diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
-index 41797a9..5a631dd 100755
---- a/libstdc++-v3/configure
-+++ b/libstdc++-v3/configure
-@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
- /* end confdefs.h. */
- #include <sys/syscall.h>
- int lk;
-+#if !defined(SYS_gettid)
-+#define SYS_gettid __NR_gettid
-+#endif
-+#if !defined(SYS_futex)
-+#define SYS_futex __NR_futex
-+#endif
- int
- main ()
- {
-@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
- /* end confdefs.h. */
- #include <sys/syscall.h>
- int lk;
-+#if !defined(SYS_gettid)
-+#define SYS_gettid __NR_gettid
-+#endif
-+#if !defined(SYS_futex)
-+#define SYS_futex __NR_futex
-+#endif
- int
- main ()
- {
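The two configure tests patched above only gain a fallback: presumably the target C library's <sys/syscall.h> provides the raw __NR_* numbers but not the SYS_* aliases, so the probe defines the latter from the former before its body runs. The same fallback in a standalone, Linux-only sketch (the printed value is just for demonstration):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Same mapping the patched configure test inserts ahead of its probe body. */
#if !defined(SYS_gettid) && defined(__NR_gettid)
# define SYS_gettid __NR_gettid
#endif

int main(void)
{
    long tid = syscall(SYS_gettid);
    printf("gettid() via raw syscall: %ld\n", tid);
    return 0;
}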
-diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h
-index e3e206b..e85dc2c 100644
---- a/libstdc++-v3/include/bits/locale_facets.h
-+++ b/libstdc++-v3/include/bits/locale_facets.h
-@@ -47,6 +47,20 @@
- #include <ext/numeric_traits.h>
- #include <bits/streambuf_iterator.h>
-
-+#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__
-+// CrystaX: for some reason, x86 gcc-4.9 makes the ctype<char>::do_widen() and
-+// ctype<char>::_M_widen_init() methods misbehave when optimization is enabled.
-+// For ctype<char>::do_widen(), the values of the passed arguments (__lo, __hi
-+// and __to) are completely messed up and don't correspond to the values passed
-+// in. If we disable optimization for those methods, things become correct, so
-+// we apply this workaround here for the time being.
-+// TODO: figure out what exactly is wrong here - is it a bug in the GCC
-+// optimization algorithm or something else?
-+#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0)))
-+#else
-+#define __CRYSTAX_X86_DONT_OPTIMIZE
-+#endif
-+
- namespace std _GLIBCXX_VISIBILITY(default)
- {
- _GLIBCXX_BEGIN_NAMESPACE_VERSION
-@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
- * @return @a __hi.
- */
- virtual const char*
-- do_widen(const char* __lo, const char* __hi, char_type* __to) const
-+ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE
- {
- __builtin_memcpy(__to, __lo, __hi - __lo);
- return __hi;
-@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
-
- private:
- void _M_narrow_init() const;
-- void _M_widen_init() const;
-+ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE;
- };
-
- #ifdef _GLIBCXX_USE_WCHAR_T
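The mechanism behind __CRYSTAX_X86_DONT_OPTIMIZE above is GCC's per-function optimize attribute, which drops the two members to -O0 only for the affected x86 gcc-4.9 build. A minimal, self-contained illustration of that attribute (the function and macro names here are invented for the example):

#include <string.h>

#if defined(__GNUC__) && !defined(__clang__)
# define DONT_OPTIMIZE __attribute__((optimize(0)))   /* per-function -O0 */
#else
# define DONT_OPTIMIZE
#endif

DONT_OPTIMIZE
static void copy_bytes(char *dst, const char *lo, const char *hi)
{
    memcpy(dst, lo, (size_t)(hi - lo));
}

int main(void)
{
    const char src[] = "abcd";
    char out[4];
    copy_bytes(out, src, src + 4);
    return out[0] == 'a' ? 0 : 1;
}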
-diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc
-index 9b61799..c149169 100644
---- a/libstdc++-v3/libsupc++/guard.cc
-+++ b/libstdc++-v3/libsupc++/guard.cc
-@@ -33,7 +33,12 @@
- #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
- && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX)
- # include <climits>
-+#if defined(__ANDROID__)
-+# include <sys/syscall.h>
-+# define SYS_futex __NR_futex
-+#else
- # include <syscall.h>
-+#endif
- # include <unistd.h>
- # define _GLIBCXX_USE_FUTEX
- # define _GLIBCXX_FUTEX_WAIT 0
diff --git a/patches/gcc/6.3.0/970-crystax.patch b/patches/gcc/6.3.0/970-crystax.patch
new file mode 100644
index 0000000..e3109cc
--- /dev/null
+++ b/patches/gcc/6.3.0/970-crystax.patch
@@ -0,0 +1,551 @@
+commit 080803512c8f6f87c2f1f711170d54033144d628
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Wed Jul 29 11:28:29 2015 +0300
+
+ [android] Apply Android-related modifications
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+[Edited: keep libstdc++, drop libcrystax-related modifications]
+diff --git a/gcc/config.gcc b/gcc/config.gcc
+index f66e48cd1..1c253496b 100644
+--- a/gcc/config.gcc
++++ b/gcc/config.gcc
+@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*)
+ TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
+ ;;
+ aarch64*-*-linux*)
+- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h"
+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
++ extra_options="${extra_options} linux-android.opt"
+ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
+ case $target in
+ aarch64_be-*)
+ tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
+ ;;
++ aarch64*-*-linux-android*)
++ tm_file="${tm_file} aarch64/aarch64-linux-android.h"
++ ;;
+ esac
+ aarch64_multilibs="${with_multilib_list}"
+ if test "$aarch64_multilibs" = "default"; then
+@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian.
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
+ extra_options="${extra_options} linux-android.opt"
+ case ${target} in
++ mips64*android*)
++ default_mips_arch=mips64r6
++ default_mips_abi=64
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="${tmake_file} mips/t-linux-android64"
++ ;;
++ mips*android*)
++ default_mips_arch=mips32
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="$tmake_file mips/t-linux-android"
++ ;;
+ mipsisa32r6*)
+ default_mips_arch=mips32r6
+ ;;
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+new file mode 100644
+index 000000000..db1288fd0
+--- /dev/null
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -0,0 +1,59 @@
++/* Machine description for AArch64 architecture.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ GCC is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING3. If not see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_AARCH64_LINUX_ANDROID_H
++#define GCC_AARCH64_LINUX_ANDROID_H
++
++
++#undef TARGET_OS_CPP_BUILTINS
++#define TARGET_OS_CPP_BUILTINS() \
++ do \
++ { \
++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
++ } \
++ while (0)
++
++#undef LINK_SPEC
++#define LINK_SPEC \
++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \
++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
++
++#undef CC1_SPEC
++#define CC1_SPEC \
++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
++
++#undef LIB_SPEC
++#define LIB_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
++
++#undef STARTFILE_SPEC
++#define STARTFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC)
++
++#undef ENDFILE_SPEC
++#define ENDFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
++
++#endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
+index 5fcaa59a3..6864195ee 100644
+--- a/gcc/config/aarch64/aarch64-linux.h
++++ b/gcc/config/aarch64/aarch64-linux.h
+@@ -21,7 +21,14 @@
+ #ifndef GCC_AARCH64_LINUX_H
+ #define GCC_AARCH64_LINUX_H
+
+-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifndef RUNTIME_ROOT_PREFIX
++#define RUNTIME_ROOT_PREFIX ""
++#endif
++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifdef BIONIC_DYNAMIC_LINKER
++#undef BIONIC_DYNAMIC_LINKER
++#endif
++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
+
+ #undef MUSL_DYNAMIC_LINKER
+ #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
+diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
+index ad123dde9..97b059de6 100644
+--- a/gcc/config/arm/arm.h
++++ b/gcc/config/arm/arm.h
+@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes
+
+ #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \
+ || (TARGET_THUMB1 \
++ && !inline_thumb1_jump_table \
+ && (optimize_size || flag_pic)))
+
+ #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \
+- (TARGET_THUMB1 \
++ (TARGET_THUMB1 && !inline_thumb1_jump_table \
+ ? (min >= 0 && max < 512 \
+ ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \
+ : min >= -256 && max < 256 \
+diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
+index 47171b996..eb22d1181 100644
+--- a/gcc/config/arm/arm.md
++++ b/gcc/config/arm/arm.md
+@@ -8179,7 +8179,7 @@
+ (match_operand:SI 2 "const_int_operand" "") ; total range
+ (match_operand:SI 3 "" "") ; table label
+ (match_operand:SI 4 "" "")] ; Out of range label
+- "TARGET_32BIT || optimize_size || flag_pic"
++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)"
+ "
+ {
+ enum insn_code code;
+diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
+index 0ebe01743..772453889 100644
+--- a/gcc/config/arm/arm.opt
++++ b/gcc/config/arm/arm.opt
+@@ -193,6 +193,10 @@ mthumb-interwork
+ Target Report Mask(INTERWORK)
+ Support calls between Thumb and ARM instruction sets.
+
++minline-thumb1-jumptable
++Target Report Var(inline_thumb1_jump_table)
++Inline Thumb1 Jump table code
++
+ mtls-dialect=
+ Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU)
+ Specify thread local storage scheme.
+diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h
+index 77f30554d..32158ed65 100644
+--- a/gcc/config/arm/elf.h
++++ b/gcc/config/arm/elf.h
+@@ -56,8 +56,7 @@
+ #undef SUBSUBTARGET_EXTRA_SPECS
+ #define SUBSUBTARGET_EXTRA_SPECS
+
+-#ifndef ASM_SPEC
+-#define ASM_SPEC "\
++#define DEFAULT_ASM_SPEC "\
+ %{mbig-endian:-EB} \
+ %{mlittle-endian:-EL} \
+ %(asm_cpu_spec) \
+@@ -66,6 +65,9 @@
+ %{mthumb-interwork:-mthumb-interwork} \
+ %{mfloat-abi=*} %{mfpu=*} \
+ %(subtarget_extra_asm_spec)"
++
++#ifndef ASM_SPEC
++#define ASM_SPEC DEFAULT_ASM_SPEC
+ #endif
+
+ /* The ARM uses @ are a comment character so we need to redefine
+@@ -104,7 +106,8 @@
+ the code more efficient, but for Thumb-1 it's better to put them out of
+ band unless we are generating compressed tables. */
+ #define JUMP_TABLES_IN_TEXT_SECTION \
+- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic)))
++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \
++ && (optimize_size || flag_pic)))
+
+ #ifndef LINK_SPEC
+ #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X"
+diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
+index ace84816e..8c8fa6553 100644
+--- a/gcc/config/arm/linux-eabi.h
++++ b/gcc/config/arm/linux-eabi.h
+@@ -108,11 +108,16 @@
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \
+ GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \
+- ANDROID_CC1_SPEC)
++ ANDROID_CC1_SPEC("-fpic"))
+
+ #define CC1PLUS_SPEC \
+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
++#undef ASM_SPEC
++#define ASM_SPEC \
++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \
++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC)
++
+ #undef LIB_SPEC
+ #define LIB_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
+diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
+index fee33a3ef..22fb2ced9 100644
+--- a/gcc/config/i386/gnu-user.h
++++ b/gcc/config/i386/gnu-user.h
+@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see
+ When the -shared link option is used a final link is not being
+ done. */
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ " -mssse3 -fno-short-enums " \
++
+ #undef ASM_SPEC
+ #define ASM_SPEC \
+- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ #undef SUBTARGET_EXTRA_SPECS
+ #define SUBTARGET_EXTRA_SPECS \
+diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
+index 7a02a7eb4..cac4179bc 100644
+--- a/gcc/config/i386/gnu-user64.h
++++ b/gcc/config/i386/gnu-user64.h
+@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define SPEC_X32 "mx32"
+ #endif
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ "%{m32:-mssse3 -fno-short-enums}" \
++ "%{!m32:-msse4.2 -mpopcnt}"
++
+ #undef ASM_SPEC
+ #define ASM_SPEC "%{" SPEC_32 ":--32} \
+ %{" SPEC_64 ":--64} \
+diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
+index 4b9910fa9..3b11ed086 100644
+--- a/gcc/config/i386/linux-common.h
++++ b/gcc/config/i386/linux-common.h
+@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see
+ #undef CC1_SPEC
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC \
++ ANDROID_TARGET_CC1_SPEC \
++ " " \
++ ANDROID_CC1_SPEC("-fPIC"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
+ #undef LINK_SPEC
+ #define LINK_SPEC \
+diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h
+index 301a41ccd..9623c88d0 100644
+--- a/gcc/config/linux-android.h
++++ b/gcc/config/linux-android.h
+@@ -38,15 +39,18 @@
+ "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}"
+
+ #define ANDROID_LINK_SPEC \
+- "%{shared: -Bsymbolic}"
++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now"
+
+-#define ANDROID_CC1_SPEC \
++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
+ "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
+- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}"
++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"
+
+ #define ANDROID_CC1PLUS_SPEC \
+- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \
+- "%{!frtti:%{!fno-rtti: -fno-rtti}}"
++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \
++ "%{!frtti:%{!fno-rtti: -frtti}}"
++
++#define ANDROID_ASM_SPEC \
++ "--noexecstack"
+
+ #define ANDROID_LIB_SPEC \
+ "%{!static: -ldl}"
+diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
+new file mode 100644
+index 000000000..32c539c8d
+--- /dev/null
++++ b/gcc/config/mips/android.h
+@@ -0,0 +1,49 @@
++/* Target macros for mips*-*android* targets.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; either version 3, or (at your option)
++any later version.
++
++GCC is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#undef DRIVER_SELF_SPECS
++#define DRIVER_SELF_SPECS \
++ /* Make sure a -mips option is present. This helps us to pick \
++ the right multilib, and also makes the later specs easier \
++ to write. */ \
++ MIPS_ISA_LEVEL_SPEC, \
++ \
++ /* Infer the default float setting from -march. */ \
++ MIPS_ARCH_FLOAT_SPEC, \
++ \
++ /* Infer the -msynci setting from -march if not explicitly set. */ \
++ MIPS_ISA_SYNCI_SPEC, \
++ \
++ /* If no ABI option is specified, infer one from the ISA level \
++ or -mgp setting. */ \
++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
++ \
++ /* If no FP ABI option is specified, infer one from the \
++ ABI/ISA level unless there is a conflicting option. */ \
++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
++ \
++ /* If no odd-spreg option is specified, infer one from the ISA. */ \
++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
++ \
++ /* Base SPECs. */ \
++ BASE_DRIVER_SELF_SPECS, \
++ \
++ /* Use the standard linux specs for everything else. */ \
++ LINUX_DRIVER_SELF_SPECS
+diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
+index 15b549c08..4a2816014 100644
+--- a/gcc/config/mips/gnu-user.h
++++ b/gcc/config/mips/gnu-user.h
+@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
+ /* The GNU C++ standard library requires this. */ \
+ if (c_dialect_cxx ()) \
+ builtin_define ("_GNU_SOURCE"); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
+ } while (0)
+
+ #undef SUBTARGET_CPP_SPEC
+@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see
+
+ #undef SUBTARGET_ASM_SPEC
+ #define SUBTARGET_ASM_SPEC \
+- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}"
++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ /* The MIPS assembler has different syntax for .set. We set it to
+ .dummy to trap any errors. */
+@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
+ #endif
+
+ #define LINUX_DRIVER_SELF_SPECS \
+- NO_SHARED_SPECS \
++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \
+ MARCH_MTUNE_NATIVE_SPECS, \
+ /* -mplt has no effect without -mno-shared. Simplify later \
+ specs handling by removing a redundant option. */ \
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8429a7ca2..8bfacf994 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see
+ #undef SUBTARGET_CC1_SPEC
+ #define SUBTARGET_CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
+
+ #undef CC1PLUS_SPEC
+ #define CC1PLUS_SPEC \
+diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
+new file mode 100644
+index 000000000..39f512c81
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android
+@@ -0,0 +1,3 @@
++MULTILIB_OPTIONS = mips32r2/mips32r6
++MULTILIB_DIRNAMES = mips-r2 mips-r6
++MULTILIB_OSDIRNAMES = ../libr2 ../libr6
+diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
+new file mode 100644
+index 000000000..55cab7d62
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android64
+@@ -0,0 +1,4 @@
++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
+diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
+index 555c0fe24..47c8655f9 100644
+--- a/libgcc/gthr-posix.h
++++ b/libgcc/gthr-posix.h
+@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define __GTHREADS 1
+ #define __GTHREADS_CXX0X 1
+
++/* The following should normally be in a different header file,
++ * but I couldn't find the right location. The point of the macro
++ * definition below is to prevent libsupc++ and libstdc++ to reference
++ * weak symbols in their static C++ constructors. Such code crashes
++ * when a shared object linked statically to these libraries is
++ * loaded on Android 2.1 (Eclair) and older platform releases, due
++ * to a dynamic linker bug.
++ */
++#ifdef __ANDROID__
++#undef GTHREAD_USE_WEAK
++#define GTHREAD_USE_WEAK 0
++#endif
++
+ #include <pthread.h>
+
+ #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \
+diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
+index 41797a971..f746e8353 100755
+--- a/libstdc++-v3/configure
++++ b/libstdc++-v3/configure
+@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h
+index e3e206b7d..e85dc2c76 100644
+--- a/libstdc++-v3/include/bits/locale_facets.h
++++ b/libstdc++-v3/include/bits/locale_facets.h
+@@ -47,6 +47,20 @@
+ #include <ext/numeric_traits.h>
+ #include <bits/streambuf_iterator.h>
+
++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__
++// CrystaX: for some reason, x86 gcc-4.9 makes ctype<char>::do_widen() and
++// ctype<char>::_M_widen_init() methods working wrong if optimization enabled.
++// For ctype<char>::do_widen(), values of passed arguments (__lo, __hi and __to)
++// are completely messed up and don't correspond to passed values. In case if
++// we disable optimization for those methods, things become correct so we apply
++// this workaround here for a time.
++// TODO: figure out what exactly wrong here - is it bug in GCC optimization
++// algorithm or smth else?
++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0)))
++#else
++#define __CRYSTAX_X86_DONT_OPTIMIZE
++#endif
++
+ namespace std _GLIBCXX_VISIBILITY(default)
+ {
+ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+ * @return @a __hi.
+ */
+ virtual const char*
+- do_widen(const char* __lo, const char* __hi, char_type* __to) const
++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE
+ {
+ __builtin_memcpy(__to, __lo, __hi - __lo);
+ return __hi;
+@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+
+ private:
+ void _M_narrow_init() const;
+- void _M_widen_init() const;
++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE;
+ };
+
+ #ifdef _GLIBCXX_USE_WCHAR_T
+diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc
+index 9b617998f..c149169bb 100644
+--- a/libstdc++-v3/libsupc++/guard.cc
++++ b/libstdc++-v3/libsupc++/guard.cc
+@@ -33,7 +33,12 @@
+ #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
+ && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX)
+ # include <climits>
++#if defined(__ANDROID__)
++# include <sys/syscall.h>
++# define SYS_futex __NR_futex
++#else
+ # include <syscall.h>
++#endif
+ # include <unistd.h>
+ # define _GLIBCXX_USE_FUTEX
+ # define _GLIBCXX_FUTEX_WAIT 0
diff --git a/patches/gcc/6.3.0/971-crystax.patch b/patches/gcc/6.3.0/971-crystax.patch
new file mode 100644
index 0000000..748a381
--- /dev/null
+++ b/patches/gcc/6.3.0/971-crystax.patch
@@ -0,0 +1,25 @@
+commit 9f057b62caafe08c968103d39b5df82486a175c2
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 13 16:11:54 2015 +0300
+
+ [android] Add additional multilib option: mfloat-abi=hard
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi
+index 8f1307c55..cbbec5bd2 100644
+--- a/gcc/config/arm/t-linux-androideabi
++++ b/gcc/config/arm/t-linux-androideabi
+@@ -1,8 +1,9 @@
+-MULTILIB_OPTIONS = march=armv7-a mthumb
+-MULTILIB_DIRNAMES = armv7-a thumb
+-MULTILIB_EXCEPTIONS =
++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard
++MULTILIB_DIRNAMES = armv7-a thumb hard
++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard*
+ MULTILIB_MATCHES =
+ MULTILIB_OSDIRNAMES =
++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch
+
+ # The "special" multilib can be used to build native applications for Android,
+ # as opposed to native shared libraries that are then called via JNI.
diff --git a/patches/gcc/6.3.0/972-crystax.patch b/patches/gcc/6.3.0/972-crystax.patch
new file mode 100644
index 0000000..b9077be
--- /dev/null
+++ b/patches/gcc/6.3.0/972-crystax.patch
@@ -0,0 +1,302 @@
+commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd
+Author: H.J. Lu <hongjiu.lu@intel.com>
+Date: Mon Apr 14 15:59:47 2014 -0700
+
+ [android] Always enable --eh-frame-hdr for static executable
+
+ See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and
+ 23e3137ee2897464b051599b85a09f130d3ad05d
+
+ Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config.in b/gcc/config.in
+index 115cb6163..933916833 100644
+--- a/gcc/config.in
++++ b/gcc/config.in
+@@ -2119,6 +2119,12 @@
+ #endif
+
+
++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */
++#ifndef USED_FOR_TARGET
++#undef USE_EH_FRAME_HDR_FOR_STATIC
++#endif
++
++
+ /* Define to 1 if the 'long long' type is wider than 'long' but still
+ efficiently supported by the host hardware. */
+ #ifndef USED_FOR_TARGET
+diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h
+index 093c38bba..54b3e0c91 100644
+--- a/gcc/config/alpha/elf.h
++++ b/gcc/config/alpha/elf.h
+@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number;
+ I imagine that other systems will catch up. In the meantime, it
+ doesn't harm to make sure that the data exists to be used later. */
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h
+index 5ded869d2..5f51ac81d 100644
+--- a/gcc/config/freebsd.h
++++ b/gcc/config/freebsd.h
+@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see
+ #define LIB_SPEC FBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #ifdef TARGET_LIBC_PROVIDES_SSP
+ #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \
+diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h
+index b0bf40a95..d1874bc29 100644
+--- a/gcc/config/gnu-user.h
++++ b/gcc/config/gnu-user.h
+@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LINK_GCC_C_SEQUENCE_SPEC
+ #define LINK_GCC_C_SEQUENCE_SPEC \
+diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h
+index 37ecfc43f..a5f1b9955 100644
+--- a/gcc/config/openbsd.h
++++ b/gcc/config/openbsd.h
+@@ -136,8 +136,12 @@ while (0)
+ #define LIB_SPEC OBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LIB_SPEC
+ #define LIB_SPEC OBSD_LIB_SPEC
+diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
+index cbf909722..eb2217fad 100644
+--- a/gcc/config/rs6000/sysv4.h
++++ b/gcc/config/rs6000/sysv4.h
+@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
+ -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}"
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
+-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# ifdef USE_EH_FRAME_HDR_FOR_STATIC
++# define LINK_EH_SPEC "--eh-frame-hdr "
++# else
++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# endif
+ #endif
+
+ #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \
+diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
+index 5160e1fda..7632a5081 100644
+--- a/gcc/config/sol2.h
++++ b/gcc/config/sol2.h
+@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see
+ /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs
+ --eh-frame-hdr to create the required .eh_frame_hdr sections. */
+ #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++#endif
+ #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
+ #endif
+
+diff --git a/gcc/configure b/gcc/configure
+index 1c6e3407c..28ad05004 100755
+--- a/gcc/configure
++++ b/gcc/configure
+@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769
+ enable_fix_cortex_a53_843419
+ with_glibc_version
+ enable_gnu_unique_object
++enable_eh_frame_hdr_for_static
+ enable_linker_build_id
+ enable_default_ssp
+ with_long_double_128
+@@ -1670,6 +1671,9 @@ Optional Features:
+ --enable-gnu-unique-object
+ enable the use of the @gnu_unique_object ELF
+ extension on glibc systems
++ --enable-eh-frame-hdr-for-static
++ enable linker PT_GNU_EH_FRAME support for static
++ executable
+ --enable-linker-build-id
+ compiler will always pass --build-id to linker
+ --enable-default-ssp enable Stack Smashing Protection as default
+@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+
+ $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h
+
++ # Check whether --enable-eh-frame-hdr-for-static was given.
++if test "${enable_eh_frame_hdr_for_static+set}" = set; then :
++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;;
++ esac
++else
++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi
++fi
++
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++
++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h
++
++ fi
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5
+ $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; }
+diff --git a/gcc/configure.ac b/gcc/configure.ac
+index 6c1dcd9ae..0cf7419e7 100644
+--- a/gcc/configure.ac
++++ b/gcc/configure.ac
+@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR])
+ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+ AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1,
+ [Define if your linker supports .eh_frame_hdr.])
++ AC_ARG_ENABLE(eh-frame-hdr-for-static,
++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static],
++ [enable linker PT_GNU_EH_FRAME support for static executable])],
++ [case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'.]) ;;
++ esac],
++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ [[if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi]])
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1,
++[Define if your system supports PT_GNU_EH_FRAME for static executable.])
++ fi
+ fi
+ AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr)
+
+diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C
+new file mode 100644
+index 000000000..15408effa
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C
+@@ -0,0 +1,25 @@
++// PR c++/4381
++// Test that exception-specs work properly for classes with virtual bases.
++
++// { dg-do run }
++// { dg-options "-static" }
++
++class Base {};
++
++struct A : virtual public Base
++{
++ A() {}
++};
++
++struct B {};
++
++void func() throw (B,A)
++{
++ throw A();
++}
++
++int main(void)
++{
++ try { func(); }
++ catch (A& a) { }
++}
+diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
+index f3343fc4f..d42647779 100644
+--- a/libgcc/crtstuff.c
++++ b/libgcc/crtstuff.c
+@@ -88,7 +88,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(BSD_DL_ITERATE_PHDR_AVAILABLE)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -97,7 +98,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__sun__) && defined(__svr4__)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -106,7 +108,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__GLIBC__) && __GLIBC__ >= 2
+ #include <link.h>
+ /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(CRTSTUFFT_O) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(inhibit_libc) \
+ && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__))
+ /* On systems using glibc, an inhibit_libc build of libgcc is only
diff --git a/patches/gcc/6.3.0/973-crystax.patch b/patches/gcc/6.3.0/973-crystax.patch
new file mode 100644
index 0000000..b96ece3
--- /dev/null
+++ b/patches/gcc/6.3.0/973-crystax.patch
@@ -0,0 +1,20 @@
+commit 778a9ef107f51544d583f110e92b75f4d9d79117
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 20 19:11:07 2015 +0300
+
+ [android] Don't use PIE copyrelocs for x86/x86_64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
+index 3d044e8bd..5c89fcab0 100644
+--- a/gcc/config/i386/i386.c
++++ b/gcc/config/i386/i386.c
+@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp)
+ else if (!SYMBOL_REF_FAR_ADDR_P (op0)
+ && (SYMBOL_REF_LOCAL_P (op0)
+ || (HAVE_LD_PIE_COPYRELOC
++ && !TARGET_HAS_BIONIC
+ && flag_pie
+ && !SYMBOL_REF_WEAK (op0)
+ && !SYMBOL_REF_FUNCTION_P (op0)))
diff --git a/patches/gcc/6.3.0/974-crystax.patch b/patches/gcc/6.3.0/974-crystax.patch
new file mode 100644
index 0000000..9db4f54
--- /dev/null
+++ b/patches/gcc/6.3.0/974-crystax.patch
@@ -0,0 +1,24 @@
+commit dbeae1190cabad83999f2540523f045acc1bb4ec
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Fri Aug 21 17:41:59 2015 +0300
+
+ [android] Always use gthr-posix.h instead of gthr-default.h
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/libgcc/gthr.h b/libgcc/gthr.h
+index 47a7d061a..67a680f90 100644
+--- a/libgcc/gthr.h
++++ b/libgcc/gthr.h
+@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define GTHREAD_USE_WEAK 1
+ #endif
+ #endif
++#if __ANDROID__
++#include "gthr-posix.h"
++#else
+ #include "gthr-default.h"
++#endif
+
+ #ifndef HIDE_EXPORTS
+ #pragma GCC visibility pop
diff --git a/patches/gcc/6.3.0/975-crystax.patch b/patches/gcc/6.3.0/975-crystax.patch
new file mode 100644
index 0000000..9efc2a4
--- /dev/null
+++ b/patches/gcc/6.3.0/975-crystax.patch
@@ -0,0 +1,31 @@
+commit 8a66d422721ae5999737d7825701ff22097d287b
+Author: Andrew Hsieh <andrewhsieh@google.com>
+Date: Mon Apr 14 21:05:51 2014 -0700
+
+ [android] Fix ARM generates insufficient alignment for NEON vst/vld
+
+ See d909af3e2469aad87d5c3e79b93c778fd26c03a9
+
+ Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
+index 5974c65d3..71b2c7aa9 100644
+--- a/gcc/config/arm/arm.c
++++ b/gcc/config/arm/arm.c
+@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code)
+ memsize = MEM_SIZE (x);
+
+ /* Only certain alignment specifiers are supported by the hardware. */
+- if (memsize == 32 && (align % 32) == 0)
++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC
++ honors stricter alignment of composite type in user code, it doesn't
++ observe the alignment of memory passed as an extra argument for function
++ returning large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */
++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 256;
+- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 128;
+ else if (memsize >= 8 && (align % 8) == 0)
+ align_bits = 64;
diff --git a/patches/gcc/6.3.0/976-crystax.patch b/patches/gcc/6.3.0/976-crystax.patch
new file mode 100644
index 0000000..790d4a9
--- /dev/null
+++ b/patches/gcc/6.3.0/976-crystax.patch
@@ -0,0 +1,21 @@
+commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Sat Aug 22 09:55:55 2015 +0300
+
+ [android][i386] Remove throw() declaration from posix_memalign() proto
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h
+index a1f98d3d1..3725799be 100644
+--- a/gcc/config/i386/pmm_malloc.h
++++ b/gcc/config/i386/pmm_malloc.h
+@@ -31,7 +31,7 @@
+ #ifndef __cplusplus
+ extern int posix_memalign (void **, size_t, size_t);
+ #else
+-extern "C" int posix_memalign (void **, size_t, size_t) throw ();
++extern "C" int posix_memalign (void **, size_t, size_t);
+ #endif
+
+ static __inline void *
diff --git a/patches/gcc/6.3.0/977-crystax.patch b/patches/gcc/6.3.0/977-crystax.patch
new file mode 100644
index 0000000..0211d72
--- /dev/null
+++ b/patches/gcc/6.3.0/977-crystax.patch
@@ -0,0 +1,33 @@
+commit 9ae82f7cfc1073820092dd9f957559667e77db0d
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Tue Aug 25 09:36:42 2015 +0300
+
+ [android] Explicitly make _Unwind_Resume visible for arm64/mips64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+index db1288fd0..38bc64d61 100644
+--- a/gcc/config/aarch64/aarch64-linux-android.h
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -57,4 +57,8 @@
+ #define ENDFILE_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
+
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif
++
+ #endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8bfacf994..262a9a341 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see
+
+ /* The default value isn't sufficient in 64-bit mode. */
+ #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024)
++
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif
diff --git a/patches/gcc/7.1.0/951-bionic-ndk.patch b/patches/gcc/7.1.0/951-bionic-ndk.patch
new file mode 100644
index 0000000..59c50a8
--- /dev/null
+++ b/patches/gcc/7.1.0/951-bionic-ndk.patch
@@ -0,0 +1,58 @@
+commit d38d37bdfe24b7ce1bdcb55642fb6b904718e68f
+Author: Howard Chu <hyc@symas.com>
+Date: Tue Apr 25 19:02:18 2017 -0700
+
+ Fix ctype for newer NDK headers
+
+diff --git a/libstdc++-v3/config/os/bionic/ctype_base.h b/libstdc++-v3/config/os/bionic/ctype_base.h
+index 33978f3..c36e63c 100644
+--- a/libstdc++-v3/config/os/bionic/ctype_base.h
++++ b/libstdc++-v3/config/os/bionic/ctype_base.h
+@@ -28,6 +28,18 @@
+
+ // Information as gleaned from /usr/include/ctype.h
+
++// _CTYPE prefix was added in NDK r14 unified headers
++#ifndef _CTYPE_U
++#define _CTYPE_U _U
++#define _CTYPE_L _L
++#define _CTYPE_D _N
++#define _CTYPE_S _S
++#define _CTYPE_P _P
++#define _CTYPE_C _C
++#define _CTYPE_X _X
++#define _CTYPE_B _B
++#endif
++
+ namespace std _GLIBCXX_VISIBILITY(default)
+ {
+ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+@@ -41,17 +53,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+ // NB: Offsets into ctype<char>::_M_table force a particular size
+ // on the mask type. Because of this, we don't use an enum.
+ typedef char mask;
+- static const mask upper = _U;
+- static const mask lower = _L;
+- static const mask alpha = _U | _L;
+- static const mask digit = _N;
+- static const mask xdigit = _X | _N;
+- static const mask space = _S;
+- static const mask print = _P | _U | _L | _N | _B;
+- static const mask graph = _P | _U | _L | _N;
+- static const mask cntrl = _C;
+- static const mask punct = _P;
+- static const mask alnum = _U | _L | _N;
++ static const mask upper = _CTYPE_U;
++ static const mask lower = _CTYPE_L;
++ static const mask alpha = _CTYPE_U | _CTYPE_L;
++ static const mask digit = _CTYPE_D;
++ static const mask xdigit = _CTYPE_X | _CTYPE_D;
++ static const mask space = _CTYPE_S;
++ static const mask print = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D | _CTYPE_B;
++ static const mask graph = _CTYPE_P | _CTYPE_U | _CTYPE_L | _CTYPE_D;
++ static const mask cntrl = _CTYPE_C;
++ static const mask punct = _CTYPE_P;
++ static const mask alnum = _CTYPE_U | _CTYPE_L | _CTYPE_D;
+ #if __cplusplus >= 201103L
+ static const mask blank = space;
+ #endif
diff --git a/patches/gcc/7.1.0/952-bionic-errno.patch b/patches/gcc/7.1.0/952-bionic-errno.patch
new file mode 100644
index 0000000..91f6ca3
--- /dev/null
+++ b/patches/gcc/7.1.0/952-bionic-errno.patch
@@ -0,0 +1,19 @@
+commit 6cd4ad106ef87a3c58b0c3478e78409b47000de0
+Author: Howard Chu <hyc@symas.com>
+Date: Tue Apr 25 20:17:03 2017 -0700
+
+ Fix, errno is volatile int
+
+diff --git a/libstdc++-v3/src/filesystem/dir.cc b/libstdc++-v3/src/filesystem/dir.cc
+index 6ff12d0..5bbd664 100644
+--- a/libstdc++-v3/src/filesystem/dir.cc
++++ b/libstdc++-v3/src/filesystem/dir.cc
+@@ -147,7 +147,7 @@ fs::_Dir::advance(error_code* ec, directory_options options)
+
+ int err = std::exchange(errno, 0);
+ const auto entp = readdir(dirp);
+- std::swap(errno, err);
++ std::swap((int&)errno, err);
+
+ if (entp)
+ {
diff --git a/patches/gcc/7.1.0/970-crystax.patch b/patches/gcc/7.1.0/970-crystax.patch
new file mode 100644
index 0000000..7324d7f
--- /dev/null
+++ b/patches/gcc/7.1.0/970-crystax.patch
@@ -0,0 +1,553 @@
+commit 080803512c8f6f87c2f1f711170d54033144d628
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Wed Jul 29 11:28:29 2015 +0300
+
+ [android] Apply Android-related modifications
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+[Edited: keep libstdc++, drop libcrystax-related modifications]
+diff --git a/gcc/config.gcc b/gcc/config.gcc
+index f66e48cd1..1c253496b 100644
+--- a/gcc/config.gcc
++++ b/gcc/config.gcc
+@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*)
+ TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
+ ;;
+ aarch64*-*-linux*)
+- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h"
+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
++ extra_options="${extra_options} linux-android.opt"
+ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
+ case $target in
+ aarch64_be-*)
+ tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
+ ;;
++ aarch64*-*-linux-android*)
++ tm_file="${tm_file} aarch64/aarch64-linux-android.h"
++ ;;
+ esac
+ aarch64_multilibs="${with_multilib_list}"
+ if test "$aarch64_multilibs" = "default"; then
+@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian.
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
+ extra_options="${extra_options} linux-android.opt"
+ case ${target} in
++ mips64*android*)
++ default_mips_arch=mips64r6
++ default_mips_abi=64
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="${tmake_file} mips/t-linux-android64"
++ ;;
++ mips*android*)
++ default_mips_arch=mips32
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="$tmake_file mips/t-linux-android"
++ ;;
+ mipsisa32r6*)
+ default_mips_arch=mips32r6
+ ;;
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+new file mode 100644
+index 000000000..db1288fd0
+--- /dev/null
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -0,0 +1,59 @@
++/* Machine description for AArch64 architecture.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ GCC is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING3. If not see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_AARCH64_LINUX_ANDROID_H
++#define GCC_AARCH64_LINUX_ANDROID_H
++
++
++#undef TARGET_OS_CPP_BUILTINS
++#define TARGET_OS_CPP_BUILTINS() \
++ do \
++ { \
++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
++ } \
++ while (0)
++
++#undef LINK_SPEC
++#define LINK_SPEC \
++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \
++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
++
++#undef CC1_SPEC
++#define CC1_SPEC \
++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
++
++#undef LIB_SPEC
++#define LIB_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
++
++#undef STARTFILE_SPEC
++#define STARTFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC)
++
++#undef ENDFILE_SPEC
++#define ENDFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
++
++#endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
+index 5fcaa59a3..6864195ee 100644
+--- a/gcc/config/aarch64/aarch64-linux.h
++++ b/gcc/config/aarch64/aarch64-linux.h
+@@ -21,7 +21,14 @@
+ #ifndef GCC_AARCH64_LINUX_H
+ #define GCC_AARCH64_LINUX_H
+
+-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifndef RUNTIME_ROOT_PREFIX
++#define RUNTIME_ROOT_PREFIX ""
++#endif
++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifdef BIONIC_DYNAMIC_LINKER
++#undef BIONIC_DYNAMIC_LINKER
++#endif
++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
+
+ #undef MUSL_DYNAMIC_LINKER
+ #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
+diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
+index ad123dde9..97b059de6 100644
+--- a/gcc/config/arm/arm.h
++++ b/gcc/config/arm/arm.h
+@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes
+
+ #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \
+ || (TARGET_THUMB1 \
++ && !inline_thumb1_jump_table \
+ && (optimize_size || flag_pic)))
+
+ #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \
+- (TARGET_THUMB1 \
++ (TARGET_THUMB1 && !inline_thumb1_jump_table \
+ ? (min >= 0 && max < 512 \
+ ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \
+ : min >= -256 && max < 256 \
+diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
+index 47171b996..eb22d1181 100644
+--- a/gcc/config/arm/arm.md
++++ b/gcc/config/arm/arm.md
+@@ -8179,7 +8179,7 @@
+ (match_operand:SI 2 "const_int_operand" "") ; total range
+ (match_operand:SI 3 "" "") ; table label
+ (match_operand:SI 4 "" "")] ; Out of range label
+- "(TARGET_32BIT || optimize_size || flag_pic) && !target_pure_code"
++ "(TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)) && !target_pure_code"
+ "
+ {
+ enum insn_code code;
+diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
+index 0ebe01743..772453889 100644
+--- a/gcc/config/arm/arm.opt
++++ b/gcc/config/arm/arm.opt
+@@ -193,6 +193,10 @@ mthumb-interwork
+ Target Report Mask(INTERWORK)
+ Support calls between Thumb and ARM instruction sets.
+
++minline-thumb1-jumptable
++Target Report Var(inline_thumb1_jump_table)
++Inline Thumb1 Jump table code
++
+ mtls-dialect=
+ Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU)
+ Specify thread local storage scheme.
+diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h
+index 77f30554d..32158ed65 100644
+--- a/gcc/config/arm/elf.h
++++ b/gcc/config/arm/elf.h
+@@ -56,8 +56,7 @@
+ #undef SUBSUBTARGET_EXTRA_SPECS
+ #define SUBSUBTARGET_EXTRA_SPECS
+
+-#ifndef ASM_SPEC
+-#define ASM_SPEC "\
++#define DEFAULT_ASM_SPEC "\
+ %{mbig-endian:-EB} \
+ %{mlittle-endian:-EL} \
+ %(asm_cpu_spec) \
+@@ -66,6 +65,9 @@
+ %{mthumb-interwork:-mthumb-interwork} \
+ %{mfloat-abi=*} %{mfpu=*} \
+ %(subtarget_extra_asm_spec)"
++
++#ifndef ASM_SPEC
++#define ASM_SPEC DEFAULT_ASM_SPEC
+ #endif
+
+ /* The ARM uses @ are a comment character so we need to redefine
+@@ -104,8 +106,9 @@
+ the code more efficient, but for Thumb-1 it's better to put them out of
+ band unless we are generating compressed tables. */
+ #define JUMP_TABLES_IN_TEXT_SECTION \
+- ((TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic))) \
+- && !target_pure_code)
++ ((TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \
++ && (optimize_size || flag_pic))) \
++ && !target_pure_code)
+
+ #ifndef LINK_SPEC
+ #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X"
+diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
+index ace84816e..8c8fa6553 100644
+--- a/gcc/config/arm/linux-eabi.h
++++ b/gcc/config/arm/linux-eabi.h
+@@ -108,11 +108,16 @@
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \
+ GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \
+- ANDROID_CC1_SPEC)
++ ANDROID_CC1_SPEC("-fpic"))
+
+ #define CC1PLUS_SPEC \
+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
++#undef ASM_SPEC
++#define ASM_SPEC \
++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \
++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC)
++
+ #undef LIB_SPEC
+ #define LIB_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
+diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
+index fee33a3ef..22fb2ced9 100644
+--- a/gcc/config/i386/gnu-user.h
++++ b/gcc/config/i386/gnu-user.h
+@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see
+ When the -shared link option is used a final link is not being
+ done. */
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ " -mssse3 -fno-short-enums " \
++
+ #undef ASM_SPEC
+ #define ASM_SPEC \
+- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ #undef SUBTARGET_EXTRA_SPECS
+ #define SUBTARGET_EXTRA_SPECS \
+diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
+index 7a02a7eb4..cac4179bc 100644
+--- a/gcc/config/i386/gnu-user64.h
++++ b/gcc/config/i386/gnu-user64.h
+@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define SPEC_X32 "mx32"
+ #endif
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ "%{m32:-mssse3 -fno-short-enums}" \
++ "%{!m32:-msse4.2 -mpopcnt}"
++
+ #undef ASM_SPEC
+ #define ASM_SPEC "%{" SPEC_32 ":--32} \
+ %{" SPEC_64 ":--64} \
+diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
+index 4b9910fa9..3b11ed086 100644
+--- a/gcc/config/i386/linux-common.h
++++ b/gcc/config/i386/linux-common.h
+@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see
+ #undef CC1_SPEC
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC \
++ ANDROID_TARGET_CC1_SPEC \
++ " " \
++ ANDROID_CC1_SPEC("-fPIC"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
+ #undef LINK_SPEC
+ #define LINK_SPEC \
+diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h
+index 301a41ccd..9623c88d0 100644
+--- a/gcc/config/linux-android.h
++++ b/gcc/config/linux-android.h
+@@ -38,15 +39,18 @@
+ "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}"
+
+ #define ANDROID_LINK_SPEC \
+- "%{shared: -Bsymbolic}"
++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now"
+
+-#define ANDROID_CC1_SPEC \
++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
+ "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
+- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}"
++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"
+
+ #define ANDROID_CC1PLUS_SPEC \
+- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \
+- "%{!frtti:%{!fno-rtti: -fno-rtti}}"
++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \
++ "%{!frtti:%{!fno-rtti: -frtti}}"
++
++#define ANDROID_ASM_SPEC \
++ "--noexecstack"
+
+ #define ANDROID_LIB_SPEC \
+ "%{!static: -ldl}"
+diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
+new file mode 100644
+index 000000000..32c539c8d
+--- /dev/null
++++ b/gcc/config/mips/android.h
+@@ -0,0 +1,49 @@
++/* Target macros for mips*-*android* targets.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; either version 3, or (at your option)
++any later version.
++
++GCC is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#undef DRIVER_SELF_SPECS
++#define DRIVER_SELF_SPECS \
++ /* Make sure a -mips option is present. This helps us to pick \
++ the right multilib, and also makes the later specs easier \
++ to write. */ \
++ MIPS_ISA_LEVEL_SPEC, \
++ \
++ /* Infer the default float setting from -march. */ \
++ MIPS_ARCH_FLOAT_SPEC, \
++ \
++ /* Infer the -msynci setting from -march if not explicitly set. */ \
++ MIPS_ISA_SYNCI_SPEC, \
++ \
++ /* If no ABI option is specified, infer one from the ISA level \
++ or -mgp setting. */ \
++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
++ \
++ /* If no FP ABI option is specified, infer one from the \
++ ABI/ISA level unless there is a conflicting option. */ \
++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
++ \
++ /* If no odd-spreg option is specified, infer one from the ISA. */ \
++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
++ \
++ /* Base SPECs. */ \
++ BASE_DRIVER_SELF_SPECS, \
++ \
++ /* Use the standard linux specs for everything else. */ \
++ LINUX_DRIVER_SELF_SPECS
+diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
+index 15b549c08..4a2816014 100644
+--- a/gcc/config/mips/gnu-user.h
++++ b/gcc/config/mips/gnu-user.h
+@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
+ /* The GNU C++ standard library requires this. */ \
+ if (c_dialect_cxx ()) \
+ builtin_define ("_GNU_SOURCE"); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
+ } while (0)
+
+ #undef SUBTARGET_CPP_SPEC
+@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see
+
+ #undef SUBTARGET_ASM_SPEC
+ #define SUBTARGET_ASM_SPEC \
+- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}"
++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ /* The MIPS assembler has different syntax for .set. We set it to
+ .dummy to trap any errors. */
+@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
+ #endif
+
+ #define LINUX_DRIVER_SELF_SPECS \
+- NO_SHARED_SPECS \
++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \
+ MARCH_MTUNE_NATIVE_SPECS, \
+ /* -mplt has no effect without -mno-shared. Simplify later \
+ specs handling by removing a redundant option. */ \
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8429a7ca2..8bfacf994 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see
+ #undef SUBTARGET_CC1_SPEC
+ #define SUBTARGET_CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
+
+ #undef CC1PLUS_SPEC
+ #define CC1PLUS_SPEC \
+diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
+new file mode 100644
+index 000000000..39f512c81
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android
+@@ -0,0 +1,3 @@
++MULTILIB_OPTIONS = mips32r2/mips32r6
++MULTILIB_DIRNAMES = mips-r2 mips-r6
++MULTILIB_OSDIRNAMES = ../libr2 ../libr6
+diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
+new file mode 100644
+index 000000000..55cab7d62
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android64
+@@ -0,0 +1,4 @@
++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
+diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
+index 555c0fe24..47c8655f9 100644
+--- a/libgcc/gthr-posix.h
++++ b/libgcc/gthr-posix.h
+@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define __GTHREADS 1
+ #define __GTHREADS_CXX0X 1
+
++/* The following should normally be in a different header file,
++ * but I couldn't find the right location. The point of the macro
++ * definition below is to prevent libsupc++ and libstdc++ to reference
++ * weak symbols in their static C++ constructors. Such code crashes
++ * when a shared object linked statically to these libraries is
++ * loaded on Android 2.1 (Eclair) and older platform releases, due
++ * to a dynamic linker bug.
++ */
++#ifdef __ANDROID__
++#undef GTHREAD_USE_WEAK
++#define GTHREAD_USE_WEAK 0
++#endif
++
+ #include <pthread.h>
+
+ #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \
+diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
+index 41797a971..f746e8353 100755
+--- a/libstdc++-v3/configure
++++ b/libstdc++-v3/configure
+@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h
+index e3e206b7d..e85dc2c76 100644
+--- a/libstdc++-v3/include/bits/locale_facets.h
++++ b/libstdc++-v3/include/bits/locale_facets.h
+@@ -47,6 +47,20 @@
+ #include <ext/numeric_traits.h>
+ #include <bits/streambuf_iterator.h>
+
++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__
++// CrystaX: for some reason, x86 gcc-4.9 makes the ctype<char>::do_widen() and
++// ctype<char>::_M_widen_init() methods misbehave when optimization is enabled.
++// For ctype<char>::do_widen(), the values of the passed arguments (__lo, __hi and __to)
++// are completely messed up and don't correspond to the passed values. If we
++// disable optimization for those methods, the results become correct, so we apply
++// this workaround here for the time being.
++// TODO: figure out what exactly is wrong here - is it a bug in the GCC optimization
++// algorithm or something else?
++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0)))
++#else
++#define __CRYSTAX_X86_DONT_OPTIMIZE
++#endif
++
+ namespace std _GLIBCXX_VISIBILITY(default)
+ {
+ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+ * @return @a __hi.
+ */
+ virtual const char*
+- do_widen(const char* __lo, const char* __hi, char_type* __to) const
++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE
+ {
+ __builtin_memcpy(__to, __lo, __hi - __lo);
+ return __hi;
+@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+
+ private:
+ void _M_narrow_init() const;
+- void _M_widen_init() const;
++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE;
+ };
+
+ #ifdef _GLIBCXX_USE_WCHAR_T
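
The __CRYSTAX_X86_DONT_OPTIMIZE macro above leans on GCC's per-function optimize attribute. A stand-alone sketch of that mechanism, with a made-up function name:

/* GNU extension: compile just this function at -O0 while the rest of the
   translation unit keeps the command-line optimization level. */
__attribute__((optimize(0)))
int keep_unoptimized (int x)
{
  return x + 1;
}
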
+diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc
+index 9b617998f..c149169bb 100644
+--- a/libstdc++-v3/libsupc++/guard.cc
++++ b/libstdc++-v3/libsupc++/guard.cc
+@@ -33,7 +33,12 @@
+ #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
+ && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX)
+ # include <climits>
++#if defined(__ANDROID__)
++# include <sys/syscall.h>
++# define SYS_futex __NR_futex
++#else
+ # include <syscall.h>
++#endif
+ # include <unistd.h>
+ # define _GLIBCXX_USE_FUTEX
+ # define _GLIBCXX_FUTEX_WAIT 0
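
Both the libstdc++ configure probes and the guard.cc hunk above fall back from the SYS_* aliases to the raw __NR_* syscall numbers when the C library (for example older Bionic) does not provide them. A minimal sketch of that pattern, assuming a Linux target; the helper name is made up.

#include <sys/syscall.h>
#include <unistd.h>

/* Fall back to the raw syscall number if the SYS_ alias is missing. */
#ifndef SYS_gettid
#define SYS_gettid __NR_gettid
#endif

static long my_gettid (void)
{
  return syscall (SYS_gettid);
}
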
diff --git a/patches/gcc/7.1.0/971-crystax.patch b/patches/gcc/7.1.0/971-crystax.patch
new file mode 100644
index 0000000..748a381
--- /dev/null
+++ b/patches/gcc/7.1.0/971-crystax.patch
@@ -0,0 +1,25 @@
+commit 9f057b62caafe08c968103d39b5df82486a175c2
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 13 16:11:54 2015 +0300
+
+ [android] Add additional multilib option: mfloat-abi=hard
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi
+index 8f1307c55..cbbec5bd2 100644
+--- a/gcc/config/arm/t-linux-androideabi
++++ b/gcc/config/arm/t-linux-androideabi
+@@ -1,8 +1,9 @@
+-MULTILIB_OPTIONS = march=armv7-a mthumb
+-MULTILIB_DIRNAMES = armv7-a thumb
+-MULTILIB_EXCEPTIONS =
++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard
++MULTILIB_DIRNAMES = armv7-a thumb hard
++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard*
+ MULTILIB_MATCHES =
+ MULTILIB_OSDIRNAMES =
++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch
+
+ # The "special" multilib can be used to build native applications for Android,
+ # as opposed to native shared libraries that are then called via JNI.
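
A minimal sketch, separate from the patch, of how code can tell which of the multilib variants above it was built against: GCC defines __ARM_PCS_VFP when -mfloat-abi=hard is in effect, which is the option the new "hard" multilib directory corresponds to.

/* Compile-time check for the hard-float calling convention. */
#if defined(__arm__) && defined(__ARM_PCS_VFP)
static const char *float_abi = "hard";
#elif defined(__arm__)
static const char *float_abi = "soft or softfp";
#else
static const char *float_abi = "not 32-bit ARM";
#endif
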
diff --git a/patches/gcc/7.1.0/972-crystax.patch b/patches/gcc/7.1.0/972-crystax.patch
new file mode 100644
index 0000000..b9077be
--- /dev/null
+++ b/patches/gcc/7.1.0/972-crystax.patch
@@ -0,0 +1,302 @@
+commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd
+Author: H.J. Lu <hongjiu.lu@intel.com>
+Date: Mon Apr 14 15:59:47 2014 -0700
+
+ [android] Always enable --eh-frame-hdr for static executable
+
+ See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and
+ 23e3137ee2897464b051599b85a09f130d3ad05d
+
+ Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config.in b/gcc/config.in
+index 115cb6163..933916833 100644
+--- a/gcc/config.in
++++ b/gcc/config.in
+@@ -2119,6 +2119,12 @@
+ #endif
+
+
++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */
++#ifndef USED_FOR_TARGET
++#undef USE_EH_FRAME_HDR_FOR_STATIC
++#endif
++
++
+ /* Define to 1 if the 'long long' type is wider than 'long' but still
+ efficiently supported by the host hardware. */
+ #ifndef USED_FOR_TARGET
+diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h
+index 093c38bba..54b3e0c91 100644
+--- a/gcc/config/alpha/elf.h
++++ b/gcc/config/alpha/elf.h
+@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number;
+ I imagine that other systems will catch up. In the meantime, it
+ doesn't harm to make sure that the data exists to be used later. */
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h
+index 5ded869d2..5f51ac81d 100644
+--- a/gcc/config/freebsd.h
++++ b/gcc/config/freebsd.h
+@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see
+ #define LIB_SPEC FBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #ifdef TARGET_LIBC_PROVIDES_SSP
+ #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \
+diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h
+index b0bf40a95..d1874bc29 100644
+--- a/gcc/config/gnu-user.h
++++ b/gcc/config/gnu-user.h
+@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LINK_GCC_C_SEQUENCE_SPEC
+ #define LINK_GCC_C_SEQUENCE_SPEC \
+diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h
+index 37ecfc43f..a5f1b9955 100644
+--- a/gcc/config/openbsd.h
++++ b/gcc/config/openbsd.h
+@@ -136,8 +136,12 @@ while (0)
+ #define LIB_SPEC OBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LIB_SPEC
+ #define LIB_SPEC OBSD_LIB_SPEC
+diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
+index cbf909722..eb2217fad 100644
+--- a/gcc/config/rs6000/sysv4.h
++++ b/gcc/config/rs6000/sysv4.h
+@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
+ -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}"
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
+-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# ifdef USE_EH_FRAME_HDR_FOR_STATIC
++# define LINK_EH_SPEC "--eh-frame-hdr "
++# else
++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# endif
+ #endif
+
+ #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \
+diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
+index 5160e1fda..7632a5081 100644
+--- a/gcc/config/sol2.h
++++ b/gcc/config/sol2.h
+@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see
+ /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs
+ --eh-frame-hdr to create the required .eh_frame_hdr sections. */
+ #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++#endif
+ #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
+ #endif
+
+diff --git a/gcc/configure b/gcc/configure
+index 1c6e3407c..28ad05004 100755
+--- a/gcc/configure
++++ b/gcc/configure
+@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769
+ enable_fix_cortex_a53_843419
+ with_glibc_version
+ enable_gnu_unique_object
++enable_eh_frame_hdr_for_static
+ enable_linker_build_id
+ enable_default_ssp
+ with_long_double_128
+@@ -1670,6 +1671,9 @@ Optional Features:
+ --enable-gnu-unique-object
+ enable the use of the @gnu_unique_object ELF
+ extension on glibc systems
++ --enable-eh-frame-hdr-for-static
++ enable linker PT_GNU_EH_FRAME support for static
++ executable
+ --enable-linker-build-id
+ compiler will always pass --build-id to linker
+ --enable-default-ssp enable Stack Smashing Protection as default
+@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+
+ $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h
+
++ # Check whether --enable-eh-frame-hdr-for-static was given.
++if test "${enable_eh_frame_hdr_for_static+set}" = set; then :
++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;;
++ esac
++else
++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi
++fi
++
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++
++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h
++
++ fi
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5
+ $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; }
+diff --git a/gcc/configure.ac b/gcc/configure.ac
+index 6c1dcd9ae..0cf7419e7 100644
+--- a/gcc/configure.ac
++++ b/gcc/configure.ac
+@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR])
+ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+ AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1,
+ [Define if your linker supports .eh_frame_hdr.])
++ AC_ARG_ENABLE(eh-frame-hdr-for-static,
++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static],
++ [enable linker PT_GNU_EH_FRAME support for static executable])],
++ [case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'.]) ;;
++ esac],
++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ [[if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi]])
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1,
++[Define if your system supports PT_GNU_EH_FRAME for static executable.])
++ fi
+ fi
+ AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr)
+
+diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C
+new file mode 100644
+index 000000000..15408effa
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C
+@@ -0,0 +1,25 @@
++// PR c++/4381
++// Test that exception-specs work properly for classes with virtual bases.
++
++// { dg-do run }
++// { dg-options "-static" }
++
++class Base {};
++
++struct A : virtual public Base
++{
++ A() {}
++};
++
++struct B {};
++
++void func() throw (B,A)
++{
++ throw A();
++}
++
++int main(void)
++{
++ try { func(); }
++ catch (A& a) { }
++}
+diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
+index f3343fc4f..d42647779 100644
+--- a/libgcc/crtstuff.c
++++ b/libgcc/crtstuff.c
+@@ -88,7 +88,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(BSD_DL_ITERATE_PHDR_AVAILABLE)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -97,7 +98,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__sun__) && defined(__svr4__)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -106,7 +108,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__GLIBC__) && __GLIBC__ >= 2
+ #include <link.h>
+ /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(CRTSTUFFT_O) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(inhibit_libc) \
+ && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__))
+ /* On systems using glibc, an inhibit_libc build of libgcc is only
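
The configure machinery and crtstuff.c changes above exist so that PT_GNU_EH_FRAME, the segment --eh-frame-hdr emits and the libgcc unwinder locates at runtime, is also usable from static executables. A minimal sketch, assuming a glibc- or Bionic-style dl_iterate_phdr(), that merely reports whether the segment is present:

#include <link.h>
#include <stdio.h>

/* Report every loaded object that carries a PT_GNU_EH_FRAME segment. */
static int report_eh_frame_hdr (struct dl_phdr_info *info, size_t size, void *data)
{
  int i;
  for (i = 0; i < info->dlpi_phnum; i++)
    if (info->dlpi_phdr[i].p_type == PT_GNU_EH_FRAME)
      printf ("%s: PT_GNU_EH_FRAME present\n",
              info->dlpi_name[0] ? info->dlpi_name : "(main executable)");
  return 0;
}

int main (void)
{
  dl_iterate_phdr (report_eh_frame_hdr, 0);
  return 0;
}
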
diff --git a/patches/gcc/7.1.0/973-crystax.patch b/patches/gcc/7.1.0/973-crystax.patch
new file mode 100644
index 0000000..b96ece3
--- /dev/null
+++ b/patches/gcc/7.1.0/973-crystax.patch
@@ -0,0 +1,20 @@
+commit 778a9ef107f51544d583f110e92b75f4d9d79117
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 20 19:11:07 2015 +0300
+
+ [android] Don't use PIE copyrelocs for x86/x86_64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
+index 3d044e8bd..5c89fcab0 100644
+--- a/gcc/config/i386/i386.c
++++ b/gcc/config/i386/i386.c
+@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp)
+ else if (!SYMBOL_REF_FAR_ADDR_P (op0)
+ && (SYMBOL_REF_LOCAL_P (op0)
+ || (HAVE_LD_PIE_COPYRELOC
++ && !TARGET_HAS_BIONIC
+ && flag_pie
+ && !SYMBOL_REF_WEAK (op0)
+ && !SYMBOL_REF_FUNCTION_P (op0)))
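
A minimal sketch of the kind of access the hunk above changes for Bionic: a -fPIE main program reading a global exported by a shared library. With HAVE_LD_PIE_COPYRELOC the x86 back end may address the variable directly and rely on the linker emitting a copy relocation; with the added !TARGET_HAS_BIONIC check it goes through the GOT instead. The symbol name is hypothetical.

/* 'shared_counter' stands in for a global defined in some shared library;
   imagine this translation unit compiled with -fPIE. */
extern int shared_counter;

int read_counter (void)
{
  return shared_counter;   /* copy reloc vs. GOT-indirect load is decided here */
}
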
diff --git a/patches/gcc/7.1.0/974-crystax.patch b/patches/gcc/7.1.0/974-crystax.patch
new file mode 100644
index 0000000..9db4f54
--- /dev/null
+++ b/patches/gcc/7.1.0/974-crystax.patch
@@ -0,0 +1,24 @@
+commit dbeae1190cabad83999f2540523f045acc1bb4ec
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Fri Aug 21 17:41:59 2015 +0300
+
+ [android] Always use gthr-posix.h instead of gthr-default.h
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/libgcc/gthr.h b/libgcc/gthr.h
+index 47a7d061a..67a680f90 100644
+--- a/libgcc/gthr.h
++++ b/libgcc/gthr.h
+@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define GTHREAD_USE_WEAK 1
+ #endif
+ #endif
++#if __ANDROID__
++#include "gthr-posix.h"
++#else
+ #include "gthr-default.h"
++#endif
+
+ #ifndef HIDE_EXPORTS
+ #pragma GCC visibility pop
diff --git a/patches/gcc/7.1.0/975-crystax.patch b/patches/gcc/7.1.0/975-crystax.patch
new file mode 100644
index 0000000..9efc2a4
--- /dev/null
+++ b/patches/gcc/7.1.0/975-crystax.patch
@@ -0,0 +1,31 @@
+commit 8a66d422721ae5999737d7825701ff22097d287b
+Author: Andrew Hsieh <andrewhsieh@google.com>
+Date: Mon Apr 14 21:05:51 2014 -0700
+
+ [android] Fix ARM generates insufficient alignment for NEON vst/vld
+
+ See d909af3e2469aad87d5c3e79b93c778fd26c03a9
+
+ Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
+index 5974c65d3..71b2c7aa9 100644
+--- a/gcc/config/arm/arm.c
++++ b/gcc/config/arm/arm.c
+@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code)
+ memsize = MEM_SIZE (x);
+
+ /* Only certain alignment specifiers are supported by the hardware. */
+- if (memsize == 32 && (align % 32) == 0)
++      /* Note that ARM EABI only guarantees 8-byte stack alignment. While GCC
++         honors stricter alignment of composite types in user code, it doesn't
++         observe the alignment of memory passed as an extra argument for a function
++         returning a large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */
++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 256;
+- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 128;
+ else if (memsize >= 8 && (align % 8) == 0)
+ align_bits = 64;
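
A minimal sketch, assuming an ARM target with NEON enabled, of the situation the comment added above describes (GCC PR 57271): a function returning a large composite by value receives a hidden pointer to caller-provided memory, and the ARM EABI only guarantees that slot 8-byte alignment, so NEON stores into it must not carry :128/:256 alignment hints. The type and function names are made up.

#include <arm_neon.h>

/* 32-byte aggregate returned by value; the return slot lives in the caller
   and is only guaranteed 8-byte alignment by the ARM EABI. */
typedef struct { uint32x4_t lo, hi; } pair128;

pair128 make_pair (uint32x4_t lo, uint32x4_t hi)
{
  pair128 p = { lo, hi };
  return p;   /* stored through the hidden return-slot pointer */
}
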
diff --git a/patches/gcc/7.1.0/976-crystax.patch b/patches/gcc/7.1.0/976-crystax.patch
new file mode 100644
index 0000000..790d4a9
--- /dev/null
+++ b/patches/gcc/7.1.0/976-crystax.patch
@@ -0,0 +1,21 @@
+commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Sat Aug 22 09:55:55 2015 +0300
+
+ [android][i386] Remove throw() declaration from posix_memalign() proto
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h
+index a1f98d3d1..3725799be 100644
+--- a/gcc/config/i386/pmm_malloc.h
++++ b/gcc/config/i386/pmm_malloc.h
+@@ -31,7 +31,7 @@
+ #ifndef __cplusplus
+ extern int posix_memalign (void **, size_t, size_t);
+ #else
+-extern "C" int posix_memalign (void **, size_t, size_t) throw ();
++extern "C" int posix_memalign (void **, size_t, size_t);
+ #endif
+
+ static __inline void *
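
For reference, a minimal sketch of the interface whose prototype the hunk above relaxes: pmm_malloc.h builds _mm_malloc() on top of posix_memalign(), and Bionic declares that function without a throw() exception specification, hence the removal. The wrapper name is made up.

#include <stdlib.h>

/* Thin wrapper over the POSIX aligned allocator used by _mm_malloc(). */
static void *alloc16 (size_t size)
{
  void *p = 0;
  if (posix_memalign (&p, 16, size) != 0)
    return 0;
  return p;
}
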
diff --git a/patches/gcc/7.1.0/977-crystax.patch b/patches/gcc/7.1.0/977-crystax.patch
new file mode 100644
index 0000000..0211d72
--- /dev/null
+++ b/patches/gcc/7.1.0/977-crystax.patch
@@ -0,0 +1,33 @@
+commit 9ae82f7cfc1073820092dd9f957559667e77db0d
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Tue Aug 25 09:36:42 2015 +0300
+
+ [android] Explicitly make _Unwind_Resume visible for arm64/mips64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+index db1288fd0..38bc64d61 100644
+--- a/gcc/config/aarch64/aarch64-linux-android.h
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -57,4 +57,8 @@
+ #define ENDFILE_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
+
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif
++
+ #endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8bfacf994..262a9a341 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see
+
+ /* The default value isn't sufficient in 64-bit mode. */
+ #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024)
++
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif
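
A stand-alone sketch of what LIBGCC2_UNWIND_ATTRIBUTE arranges in the two headers above: default (exported) visibility for the libgcc unwinder entry points even when the surrounding code is built with hidden visibility. The function below is a stand-in, not the real _Unwind_Resume.

#pragma GCC visibility push(hidden)

/* Exported despite the surrounding hidden-visibility region. */
__attribute__((visibility("default")))
void example_unwind_entry (void)
{
}

#pragma GCC visibility pop
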
diff --git a/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch b/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch
deleted file mode 100644
index 1ae1186..0000000
--- a/patches/gcc/linaro-6.3-2017.02/950-bionic-android.patch
+++ /dev/null
@@ -1,17570 +0,0 @@
-Based on https://github.com/crystax/android-toolchain-gcc-6/commits/master
-
-Up to commit e510dbdc79714512ac87126f1832f366d56623b6 but with
-Crystax-specific lib support omitted
-diff --git a/gcc/config.gcc b/gcc/config.gcc
-index f66e48c..4380657 100644
---- a/gcc/config.gcc
-+++ b/gcc/config.gcc
-@@ -374,6 +374,7 @@ i[34567]86-*-*)
- rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h
- adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h
- avx512cdintrin.h avx512erintrin.h avx512pfintrin.h
-+ arm_neon.h
- shaintrin.h clflushoptintrin.h xsavecintrin.h
- xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
- avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
-@@ -396,6 +397,7 @@ x86_64-*-*)
- rtmintrin.h xtestintrin.h rdseedintrin.h prfchwintrin.h
- adxintrin.h fxsrintrin.h xsaveintrin.h xsaveoptintrin.h
- avx512cdintrin.h avx512erintrin.h avx512pfintrin.h
-+ arm_neon.h
- shaintrin.h clflushoptintrin.h xsavecintrin.h
- xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
- avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
-@@ -942,13 +944,17 @@ aarch64*-*-elf | aarch64*-*-rtems*)
- TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
- ;;
- aarch64*-*-linux*)
-- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
-+ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h"
- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
-+ extra_options="${extra_options} linux-android.opt"
- tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
- case $target in
- aarch64_be-*)
- tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
- ;;
-+ aarch64*-*-linux-android*)
-+ tm_file="${tm_file} aarch64/aarch64-linux-android.h"
-+ ;;
- esac
- aarch64_multilibs="${with_multilib_list}"
- if test "$aarch64_multilibs" = "default"; then
-@@ -2055,6 +2061,17 @@ mips*-*-linux*) # Linux MIPS, either endian.
- tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
- extra_options="${extra_options} linux-android.opt"
- case ${target} in
-+ mips64*android*)
-+ default_mips_arch=mips64r6
-+ default_mips_abi=64
-+ tm_file="${tm_file} mips/android.h"
-+ tmake_file="${tmake_file} mips/t-linux-android64"
-+ ;;
-+ mips*android*)
-+ default_mips_arch=mips32
-+ tm_file="${tm_file} mips/android.h"
-+ tmake_file="$tmake_file mips/t-linux-android"
-+ ;;
- mipsisa32r6*)
- default_mips_arch=mips32r6
- ;;
-diff --git a/gcc/config.in b/gcc/config.in
-index 115cb61..9339168 100644
---- a/gcc/config.in
-+++ b/gcc/config.in
-@@ -2119,6 +2119,12 @@
- #endif
-
-
-+/* Define if your system supports PT_GNU_EH_FRAME for static executable. */
-+#ifndef USED_FOR_TARGET
-+#undef USE_EH_FRAME_HDR_FOR_STATIC
-+#endif
-+
-+
- /* Define to 1 if the 'long long' type is wider than 'long' but still
- efficiently supported by the host hardware. */
- #ifndef USED_FOR_TARGET
-diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
-new file mode 100644
-index 0000000..cffd84d
---- /dev/null
-+++ b/gcc/config/aarch64/aarch64-linux-android.h
-@@ -0,0 +1,63 @@
-+/* Machine description for AArch64 architecture.
-+ Copyright (C) 2014 Free Software Foundation, Inc.
-+
-+ This file is part of GCC.
-+
-+ GCC is free software; you can redistribute it and/or modify it
-+ under the terms of the GNU General Public License as published by
-+ the Free Software Foundation; either version 3, or (at your option)
-+ any later version.
-+
-+ GCC is distributed in the hope that it will be useful, but
-+ WITHOUT ANY WARRANTY; without even the implied warranty of
-+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ General Public License for more details.
-+
-+ You should have received a copy of the GNU General Public License
-+ along with GCC; see the file COPYING3. If not see
-+ <http://www.gnu.org/licenses/>. */
-+
-+#ifndef GCC_AARCH64_LINUX_ANDROID_H
-+#define GCC_AARCH64_LINUX_ANDROID_H
-+
-+
-+#undef TARGET_OS_CPP_BUILTINS
-+#define TARGET_OS_CPP_BUILTINS() \
-+ do \
-+ { \
-+ GNU_USER_TARGET_OS_CPP_BUILTINS(); \
-+ ANDROID_TARGET_OS_CPP_BUILTINS(); \
-+ } \
-+ while (0)
-+
-+#undef LINK_SPEC
-+#define LINK_SPEC \
-+ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \
-+ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
-+
-+#undef CC1_SPEC
-+#define CC1_SPEC \
-+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
-+
-+#define CC1PLUS_SPEC \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-+
-+#undef LIB_SPEC
-+#define LIB_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
-+ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
-+
-+#undef STARTFILE_SPEC
-+#define STARTFILE_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC)
-+
-+#undef ENDFILE_SPEC
-+#define ENDFILE_SPEC \
-+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
-+
-+#ifdef IN_LIBGCC2
-+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
-+#endif
-+
-+#endif /* GCC_AARCH64_LINUX_ANDROID_H */
-diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
-index 5fcaa59..6864195 100644
---- a/gcc/config/aarch64/aarch64-linux.h
-+++ b/gcc/config/aarch64/aarch64-linux.h
-@@ -21,7 +21,14 @@
- #ifndef GCC_AARCH64_LINUX_H
- #define GCC_AARCH64_LINUX_H
-
--#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
-+#ifndef RUNTIME_ROOT_PREFIX
-+#define RUNTIME_ROOT_PREFIX ""
-+#endif
-+#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
-+#ifdef BIONIC_DYNAMIC_LINKER
-+#undef BIONIC_DYNAMIC_LINKER
-+#endif
-+#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
-
- #undef MUSL_DYNAMIC_LINKER
- #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
-diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h
-index 093c38b..54b3e0c 100644
---- a/gcc/config/alpha/elf.h
-+++ b/gcc/config/alpha/elf.h
-@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number;
- I imagine that other systems will catch up. In the meantime, it
- doesn't harm to make sure that the data exists to be used later. */
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
-index 5974c65..71b2c7a 100644
---- a/gcc/config/arm/arm.c
-+++ b/gcc/config/arm/arm.c
-@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code)
- memsize = MEM_SIZE (x);
-
- /* Only certain alignment specifiers are supported by the hardware. */
-- if (memsize == 32 && (align % 32) == 0)
-+ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC
-+ honors stricter alignment of composite type in user code, it doesn't
-+ observe the alignment of memory passed as an extra argument for function
-+ returning large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */
-+ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED)
- align_bits = 256;
-- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
-+ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED)
- align_bits = 128;
- else if (memsize >= 8 && (align % 8) == 0)
- align_bits = 64;
-diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
-index ad123dd..97b059d 100644
---- a/gcc/config/arm/arm.h
-+++ b/gcc/config/arm/arm.h
-@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes
-
- #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \
- || (TARGET_THUMB1 \
-+ && !inline_thumb1_jump_table \
- && (optimize_size || flag_pic)))
-
- #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \
-- (TARGET_THUMB1 \
-+ (TARGET_THUMB1 && !inline_thumb1_jump_table \
- ? (min >= 0 && max < 512 \
- ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \
- : min >= -256 && max < 256 \
-diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
-index 47171b9..eb22d11 100644
---- a/gcc/config/arm/arm.md
-+++ b/gcc/config/arm/arm.md
-@@ -8179,7 +8179,7 @@
- (match_operand:SI 2 "const_int_operand" "") ; total range
- (match_operand:SI 3 "" "") ; table label
- (match_operand:SI 4 "" "")] ; Out of range label
-- "TARGET_32BIT || optimize_size || flag_pic"
-+ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)"
- "
- {
- enum insn_code code;
-diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
-index 0ebe017..7724538 100644
---- a/gcc/config/arm/arm.opt
-+++ b/gcc/config/arm/arm.opt
-@@ -193,6 +193,10 @@ mthumb-interwork
- Target Report Mask(INTERWORK)
- Support calls between Thumb and ARM instruction sets.
-
-+minline-thumb1-jumptable
-+Target Report Var(inline_thumb1_jump_table)
-+Inline Thumb1 Jump table code
-+
- mtls-dialect=
- Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU)
- Specify thread local storage scheme.
-diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h
-index 77f3055..32158ed 100644
---- a/gcc/config/arm/elf.h
-+++ b/gcc/config/arm/elf.h
-@@ -56,8 +56,7 @@
- #undef SUBSUBTARGET_EXTRA_SPECS
- #define SUBSUBTARGET_EXTRA_SPECS
-
--#ifndef ASM_SPEC
--#define ASM_SPEC "\
-+#define DEFAULT_ASM_SPEC "\
- %{mbig-endian:-EB} \
- %{mlittle-endian:-EL} \
- %(asm_cpu_spec) \
-@@ -66,6 +65,9 @@
- %{mthumb-interwork:-mthumb-interwork} \
- %{mfloat-abi=*} %{mfpu=*} \
- %(subtarget_extra_asm_spec)"
-+
-+#ifndef ASM_SPEC
-+#define ASM_SPEC DEFAULT_ASM_SPEC
- #endif
-
- /* The ARM uses @ are a comment character so we need to redefine
-@@ -104,7 +106,8 @@
- the code more efficient, but for Thumb-1 it's better to put them out of
- band unless we are generating compressed tables. */
- #define JUMP_TABLES_IN_TEXT_SECTION \
-- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic)))
-+ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \
-+ && (optimize_size || flag_pic)))
-
- #ifndef LINK_SPEC
- #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X"
-diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
-index ace8481..ed7f4d0 100644
---- a/gcc/config/arm/linux-eabi.h
-+++ b/gcc/config/arm/linux-eabi.h
-@@ -108,11 +108,16 @@
- #define CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \
- GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \
-- ANDROID_CC1_SPEC)
-+ ANDROID_CC1_SPEC("-fpic"))
-
- #define CC1PLUS_SPEC \
- LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-
-+#undef ASM_SPEC
-+#define ASM_SPEC \
-+ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \
-+ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC)
-+
- #undef LIB_SPEC
- #define LIB_SPEC \
- LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
-diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi
-index 8f1307c..cbbec5b 100644
---- a/gcc/config/arm/t-linux-androideabi
-+++ b/gcc/config/arm/t-linux-androideabi
-@@ -1,8 +1,9 @@
--MULTILIB_OPTIONS = march=armv7-a mthumb
--MULTILIB_DIRNAMES = armv7-a thumb
--MULTILIB_EXCEPTIONS =
-+MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard
-+MULTILIB_DIRNAMES = armv7-a thumb hard
-+MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard*
- MULTILIB_MATCHES =
- MULTILIB_OSDIRNAMES =
-+MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch
-
- # The "special" multilib can be used to build native applications for Android,
- # as opposed to native shared libraries that are then called via JNI.
-diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h
-index 5ded869..5f51ac8 100644
---- a/gcc/config/freebsd.h
-+++ b/gcc/config/freebsd.h
-@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see
- #define LIB_SPEC FBSD_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #ifdef TARGET_LIBC_PROVIDES_SSP
- #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \
-diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h
-index b0bf40a..d1874bc 100644
---- a/gcc/config/gnu-user.h
-+++ b/gcc/config/gnu-user.h
-@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #undef LINK_GCC_C_SEQUENCE_SPEC
- #define LINK_GCC_C_SEQUENCE_SPEC \
-diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h
-new file mode 100644
-index 0000000..10944f3
---- /dev/null
-+++ b/gcc/config/i386/arm_neon.h
-@@ -0,0 +1,16622 @@
-+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com
-+
-+//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved.
-+
-+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-+
-+//By downloading, copying, installing or using the software you agree to this license.
-+//If you do not agree to this license, do not download, install, copy or use the software.
-+
-+// License Agreement
-+
-+//Permission to use, copy, modify, and/or distribute this software for any
-+//purpose with or without fee is hereby granted, provided that the above
-+//copyright notice and this permission notice appear in all copies.
-+
-+//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-+//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
-+//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-+//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
-+//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
-+//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-+//PERFORMANCE OF THIS SOFTWARE.
-+
-+//*****************************************************************************************
-+// This file is intended to simplify ARM->IA32 porting
-+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
-+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
-+// MMX instruction set is not used due to performance overhead and the necessity to use the
-+// EMMS instruction (_mm_empty())for mmx-x87 floating point switching
-+//*****************************************************************************************
-+
-+//!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual.
-+//!!!!!!! Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for
-+//!!!!!!! greater performance. It can be done by -msse4.2 compiler switch.
-+
-+#ifndef NEON2SSE_H
-+#define NEON2SSE_H
-+
-+#ifndef USE_SSE4
-+#if defined(__SSE4_2__)
-+ #define USE_SSE4
-+#endif
-+#endif
-+
-+#include <xmmintrin.h> //SSE
-+#include <emmintrin.h> //SSE2
-+#include <pmmintrin.h> //SSE3
-+#include <tmmintrin.h> //SSSE3
-+#ifdef USE_SSE4
-+#include <smmintrin.h> //SSE4.1
-+#include <nmmintrin.h> //SSE4.2
-+#endif
-+
-+
-+//*************** functions and data attributes, compiler dependent *********************************
-+//***********************************************************************************
-+#ifdef __GNUC__
-+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-+#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
-+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-+#if _GCC_VERSION < 40500
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
-+#else
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
-+#endif
-+#if defined(__x86_64__)
-+ #define _NEON2SSE_64BIT __x86_64__
-+#endif
-+#else
-+#define _NEON2SSE_ALIGN_16 __declspec(align(16))
-+#define _NEON2SSE_INLINE __inline
-+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
-+#if defined(_M_X64)
-+ #define _NEON2SSE_64BIT _M_X64
-+#endif
-+#else
-+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
-+#endif
-+#endif
-+
-+#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
-+ #define _NEON2SSE_64BIT_SSE4
-+#endif
-+
-+/*********************************************************************************************************************/
-+// data types conversion
-+/*********************************************************************************************************************/
-+#if defined(_MSC_VER) && (_MSC_VER < 1300)
-+ typedef signed char int8_t;
-+ typedef unsigned char uint8_t;
-+ typedef signed short int16_t;
-+ typedef unsigned short uint16_t;
-+ typedef signed int int32_t;
-+ typedef unsigned int uint32_t;
-+ typedef signed long long int64_t;
-+ typedef unsigned long long uint64_t;
-+#elif defined(_MSC_VER)
-+ typedef signed __int8 int8_t;
-+ typedef unsigned __int8 uint8_t;
-+ typedef signed __int16 int16_t;
-+ typedef unsigned __int16 uint16_t;
-+ typedef signed __int32 int32_t;
-+ typedef unsigned __int32 uint32_t;
-+
-+ typedef signed long long int64_t;
-+ typedef unsigned long long uint64_t;
-+#else
-+#include <stdint.h>
-+#include <limits.h>
-+#endif
-+
-+typedef union __m64_128 {
-+ uint64_t m64_u64[1];
-+ float m64_f32[2];
-+ int8_t m64_i8[8];
-+ int16_t m64_i16[4];
-+ int32_t m64_i32[2];
-+ int64_t m64_i64[1];
-+ uint8_t m64_u8[8];
-+ uint16_t m64_u16[4];
-+ uint32_t m64_u32[2];
-+} __m64_128;
-+
-+typedef __m64_128 int8x8_t;
-+typedef __m64_128 uint8x8_t;
-+typedef __m64_128 int16x4_t;
-+typedef __m64_128 uint16x4_t;
-+typedef __m64_128 int32x2_t;
-+typedef __m64_128 uint32x2_t;
-+typedef __m64_128 int64x1_t;
-+typedef __m64_128 uint64x1_t;
-+typedef __m64_128 poly8x8_t;
-+typedef __m64_128 poly16x4_t;
-+
-+typedef __m64_128 float32x2_t;
-+typedef __m128 float32x4_t;
-+
-+typedef __m128 float16x4_t; //not supported by IA, for compatibility
-+typedef __m128 float16x8_t; //not supported by IA, for compatibility
-+
-+typedef __m128i int8x16_t;
-+typedef __m128i int16x8_t;
-+typedef __m128i int32x4_t;
-+typedef __m128i int64x2_t;
-+typedef __m128i uint8x16_t;
-+typedef __m128i uint16x8_t;
-+typedef __m128i uint32x4_t;
-+typedef __m128i uint64x2_t;
-+typedef __m128i poly8x16_t;
-+typedef __m128i poly16x8_t;
-+
-+#if defined(_MSC_VER)
-+ #define SINT_MIN (-2147483647 - 1) /* min signed int value */
-+ #define SINT_MAX 2147483647 /* max signed int value */
-+#else
-+ #define SINT_MIN INT_MIN /* min signed int value */
-+ #define SINT_MAX INT_MAX /* max signed int value */
-+#endif
-+
-+typedef float float32_t;
-+#if !defined(__clang__)
-+typedef float __fp16;
-+#endif
-+
-+typedef uint8_t poly8_t;
-+typedef uint16_t poly16_t;
-+
-+
-+//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in
-+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types
-+struct int8x16x2_t {
-+ int8x16_t val[2];
-+};
-+struct int16x8x2_t {
-+ int16x8_t val[2];
-+};
-+struct int32x4x2_t {
-+ int32x4_t val[2];
-+};
-+struct int64x2x2_t {
-+ int64x2_t val[2];
-+};
-+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
-+struct int8x8x2_t {
-+ int8x8_t val[2];
-+};
-+struct int16x4x2_t {
-+ int16x4_t val[2];
-+};
-+struct int32x2x2_t {
-+ int32x2_t val[2];
-+};
-+struct int64x1x2_t {
-+ int64x1_t val[2];
-+};
-+
-+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
-+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
-+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
-+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
-+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
-+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
-+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
-+typedef struct int8x16x2_t uint8x16x2_t;
-+typedef struct int16x8x2_t uint16x8x2_t;
-+typedef struct int32x4x2_t uint32x4x2_t;
-+typedef struct int64x2x2_t uint64x2x2_t;
-+typedef struct int8x16x2_t poly8x16x2_t;
-+typedef struct int16x8x2_t poly16x8x2_t;
-+
-+typedef struct int8x8x2_t uint8x8x2_t;
-+typedef struct int16x4x2_t uint16x4x2_t;
-+typedef struct int32x2x2_t uint32x2x2_t;
-+typedef struct int64x1x2_t uint64x1x2_t;
-+typedef struct int8x8x2_t poly8x8x2_t;
-+typedef struct int16x4x2_t poly16x4x2_t;
-+
-+//float
-+struct float32x4x2_t {
-+ float32x4_t val[2];
-+};
-+struct float16x8x2_t {
-+ float16x8_t val[2];
-+};
-+struct float32x2x2_t {
-+ float32x2_t val[2];
-+};
-+
-+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
-+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
-+typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
-+typedef float16x8x2_t float16x4x2_t;
-+
-+//4
-+struct int8x16x4_t {
-+ int8x16_t val[4];
-+};
-+struct int16x8x4_t {
-+ int16x8_t val[4];
-+};
-+struct int32x4x4_t {
-+ int32x4_t val[4];
-+};
-+struct int64x2x4_t {
-+ int64x2_t val[4];
-+};
-+
-+struct int8x8x4_t {
-+ int8x8_t val[4];
-+};
-+struct int16x4x4_t {
-+ int16x4_t val[4];
-+};
-+struct int32x2x4_t {
-+ int32x2_t val[4];
-+};
-+struct int64x1x4_t {
-+ int64x1_t val[4];
-+};
-+
-+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
-+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
-+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
-+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
-+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
-+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
-+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-+typedef struct int8x8x4_t uint8x8x4_t;
-+typedef struct int16x4x4_t uint16x4x4_t;
-+typedef struct int32x2x4_t uint32x2x4_t;
-+typedef struct int64x1x4_t uint64x1x4_t;
-+typedef struct int8x8x4_t poly8x8x4_t;
-+typedef struct int16x4x4_t poly16x4x4_t;
-+
-+typedef struct int8x16x4_t uint8x16x4_t;
-+typedef struct int16x8x4_t uint16x8x4_t;
-+typedef struct int32x4x4_t uint32x4x4_t;
-+typedef struct int64x2x4_t uint64x2x4_t;
-+typedef struct int8x16x4_t poly8x16x4_t;
-+typedef struct int16x8x4_t poly16x8x4_t;
-+
-+struct float32x4x4_t {
-+ float32x4_t val[4];
-+};
-+struct float16x8x4_t {
-+ float16x8_t val[4];
-+};
-+struct float32x2x4_t {
-+ float32x2_t val[4];
-+};
-+
-+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
-+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
-+typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
-+typedef float16x8x4_t float16x4x4_t;
-+
-+//3
-+struct int16x8x3_t {
-+ int16x8_t val[3];
-+};
-+struct int32x4x3_t {
-+ int32x4_t val[3];
-+};
-+struct int64x2x3_t {
-+ int64x2_t val[3];
-+};
-+struct int8x16x3_t {
-+ int8x16_t val[3];
-+};
-+
-+struct int16x4x3_t {
-+ int16x4_t val[3];
-+};
-+struct int32x2x3_t {
-+ int32x2_t val[3];
-+};
-+struct int64x1x3_t {
-+ int64x1_t val[3];
-+};
-+struct int8x8x3_t {
-+ int8x8_t val[3];
-+};
-+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
-+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
-+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
-+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
-+
-+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
-+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
-+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
-+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
-+
-+
-+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-+typedef struct int8x16x3_t uint8x16x3_t;
-+typedef struct int16x8x3_t uint16x8x3_t;
-+typedef struct int32x4x3_t uint32x4x3_t;
-+typedef struct int64x2x3_t uint64x2x3_t;
-+typedef struct int8x16x3_t poly8x16x3_t;
-+typedef struct int16x8x3_t poly16x8x3_t;
-+typedef struct int8x8x3_t uint8x8x3_t;
-+typedef struct int16x4x3_t uint16x4x3_t;
-+typedef struct int32x2x3_t uint32x2x3_t;
-+typedef struct int64x1x3_t uint64x1x3_t;
-+typedef struct int8x8x3_t poly8x8x3_t;
-+typedef struct int16x4x3_t poly16x4x3_t;
-+
-+//float
-+struct float32x4x3_t {
-+ float32x4_t val[3];
-+};
-+struct float32x2x3_t {
-+ float32x2_t val[3];
-+};
-+struct float16x8x3_t {
-+ float16x8_t val[3];
-+};
-+
-+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
-+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
-+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
-+typedef float16x8x3_t float16x4x3_t;
-+
-+
-+//****************************************************************************
-+//****** Porting auxiliary macros ********************************************
-+
-+//** floating point related macros **
-+#define _M128i(a) _mm_castps_si128(a)
-+#define _M128(a) _mm_castsi128_ps(a)
-+//here the most performance effective implementation is compiler and 32/64 bits build dependent
-+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
-+
-+ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
-+ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
-+ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
-+#else
-+ //for 32bit gcc and Microsoft compilers builds
-+ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
-+ #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
-+ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
-+#endif
-+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
-+
-+#define return64(a) _M64(res64,a); return res64;
-+#define return64f(a) _M64f(res64,a); return res64;
-+
-+#define _Ui64(a) (*(uint64_t*)&(a))
-+#define _UNSIGNED_T(a) u ## a
-+
-+#define _SIGNBIT64 ((uint64_t)1 << 63)
-+#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
-+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
-+
-+#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
-+#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
-+
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+#define __constrange(min,max) const
-+#define __transfersize(size)
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+
-+
-+//*************************************************************************
-+//*************************************************************************
-+//********* Functions declarations as declared in original arm_neon.h *****
-+//*************************************************************************
-+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
-+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
-+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
-+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
-+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
-+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
-+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
-+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
-+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
-+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
-+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
-+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
-+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
-+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
-+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
-+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
-+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
-+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
-+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
-+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
-+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-+//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
-+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
-+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
-+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
-+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
-+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
-+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-+//Vector rounding add high half: vraddhn
-+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-+//Multiplication
-+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
-+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
-+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-+//multiply lane
-+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
-+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
-+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
-+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
-+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
-+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
-+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
-+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
-+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
-+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
-+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
-+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
-+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
-+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-+//Vector multiply subtract long
-+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
-+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-+//Vector saturating doubling multiply high
-+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
-+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-+//Vector saturating rounding doubling multiply high
-+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-+//Vector saturating doubling multiply accumulate long
-+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-+//Vector saturating doubling multiply subtract long
-+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-+//Vector long multiply
-+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
-+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-+//Vector saturating doubling long multiply
-+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-+//Subtraction
-+//Vector subtract
-+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
-+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
-+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-+//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
-+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
-+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-+//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
-+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
-+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-+//Vector saturating subtract
-+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
-+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
-+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
-+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
-+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
-+//Vector halving subtract
-+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
-+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
-+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
-+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
-+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
-+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
-+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-+//Vector subtract high half
-+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-+//Vector rounding subtract high half
-+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+//Comparison
-+//Vector compare equal
-+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-+//Vector compare greater-than or equal
-+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+//Vector compare less-than or equal
-+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+//Vector compare greater-than
-+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+//Vector compare less-than
-+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+//Vector compare absolute greater-than or equal
-+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+//Vector compare absolute less-than or equal
-+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+//Vector compare absolute greater-than
-+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+//Vector compare absolute less-than
-+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+//Vector test bits
-+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
-+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
-+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
-+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
-+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
-+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
-+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
-+//Absolute difference
-+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
-+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
-+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
-+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
-+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
-+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
-+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
-+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
-+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-+//Absolute difference - long
-+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
-+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
-+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
-+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
-+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-+//Absolute difference and accumulate - long
-+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
-+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
-+//Max/Min
-+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
-+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
-+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
-+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
-+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
-+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
-+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
-+//Pairwise addition
-+//Pairwise add
-+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-+//Long pairwise add
-+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
-+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
-+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-+//Long pairwise add and accumulate
-+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
-+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
-+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
-+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
-+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
-+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-+//Folding maximum vpmax -> takes maximum of adjacent pairs
-+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
-+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-+//Folding minimum vpmin -> takes minimum of adjacent pairs
-+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
-+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
-+//Reciprocal/Sqrt
-+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-+//Shifts by signed variable
-+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
-+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
-+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
-+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-+//Vector saturating shift left: (negative values shift right)
-+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
-+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
-+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-+//Vector rounding shift left: (negative values shift right)
-+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
-+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
-+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-+//Vector saturating rounding shift left: (negative values shift right)
-+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
-+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
-+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
-+//Shifts by a constant
-+//Vector shift right by constant
-+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
-+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
-+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-+//Vector shift left by constant
-+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+//Vector rounding shift right by constant
-+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
-+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
-+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-+//Vector shift right by constant and accumulate
-+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
-+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
-+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-+//Vector rounding shift right by constant and accumulate
-+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
-+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
-+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-+//Vector saturating shift left by constant
-+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
-+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
-+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-+//Vector signed->unsigned saturating shift left by constant
-+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-+//Vector narrowing shift right by constant
-+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+//Vector signed->unsigned narrowing saturating shift right by constant
-+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-+//Vector signed->unsigned rounding narrowing saturating shift right by constant
-+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-+//Vector narrowing saturating shift right by constant
-+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
-+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-+//Vector rounding narrowing shift right by constant
-+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+//Vector rounding narrowing saturating shift right by constant
-+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
-+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
-+//Vector widening shift left by constant
-+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
-+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
-+//Shifts with insert
-+//Vector shift right and insert
-+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+//Vector shift left and insert
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
-+//Load a single vector from memory
-+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-+//Load a single lane from memory
-+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
-+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
-+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
-+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-+//Load all lanes of vector with same value from memory
-+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+//Store a single vector or lane. Stores all lanes or a single lane of a vector.
-+//Store a single vector into memory
-+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
-+//Store a lane of a vector into memory
-+//Loads of an N-element structure
-+//Load N-element structure from memory
-+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+//Load all lanes of N-element structure with same value from memory
-+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+//Load a single lane of N-element structure from memory
-+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
-+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
-+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
-+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+//Store N-element structure to memory
-+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-+//Store a single lane of N-element structure to memory
-+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
-+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
-+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
-+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
-+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
-+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+//Initialize a vector from a literal bit pattern.
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
-+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
-+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
-+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
-+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
-+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
-+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
-+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
-+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
-+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
-+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
-+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
-+//Set all lanes to same value
-+//Load all lanes of vector to the same literal value
-+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
-+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
-+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
-+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
-+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
-+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
-+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
-+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
-+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
-+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
-+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
-+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
-+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
-+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
-+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
-+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
-+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
-+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
-+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
-+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
-+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+//Load all lanes of the vector to the value of a lane of a vector
-+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
-+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
-+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
-+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
-+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
-+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
-+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
-+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
-+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
-+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
-+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
-+//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
-+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
-+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
-+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
-+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
-+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
-+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
-+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
-+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
-+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
-+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
-+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
-+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
-+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
-+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
-+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
-+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
-+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
-+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
-+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
-+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
-+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
-+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
-+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
-+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
-+//Converting vectors. These intrinsics are used to convert vectors.
-+//Convert from float
-+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
-+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
-+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
-+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
-+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
-+//Convert to float
-+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
-+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
-+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
-+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
-+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
-+//Convert between floats
-+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
-+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
-+//Vector narrow integer
-+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
-+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
-+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
-+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
-+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
-+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
-+//Vector long move
-+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
-+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
-+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
-+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
-+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
-+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
-+//Vector saturating narrow integer
-+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
-+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
-+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
-+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
-+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
-+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
-+//Vector saturating narrow integer signed->unsigned
-+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
-+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
-+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
-+//Table look up
-+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
-+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+//Extended table look up intrinsics
-+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
-+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+//Operations with a scalar value
-+//Vector multiply accumulate with scalar
-+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
-+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
-+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
-+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
-+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
-+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
-+//Vector widening multiply accumulate with scalar
-+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
-+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
-+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
-+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
-+//Vector widening saturating doubling multiply accumulate with scalar
-+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
-+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
-+//Vector multiply subtract with scalar
-+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
-+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
-+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
-+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
-+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
-+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
-+//Vector widening multiply subtract with scalar
-+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
-+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
-+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
-+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
-+//Vector widening saturating doubling multiply subtract with scalar
-+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
-+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
-+//Vector multiply by scalar
-+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
-+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
-+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
-+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
-+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
-+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-+//Vector long multiply with scalar
-+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
-+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
-+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
-+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
-+//Vector long multiply by scalar
-+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
-+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
-+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
-+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
-+//Vector saturating doubling long multiply with scalar
-+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
-+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
-+//Vector saturating doubling long multiply by scalar
-+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
-+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
-+//Vector saturating doubling multiply high with scalar
-+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
-+//Vector saturating doubling multiply high by scalar
-+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
-+//Vector saturating rounding doubling multiply high with scalar
-+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
-+//Vector rounding saturating doubling multiply high by scalar
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
-+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
-+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
-+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
-+//Vector multiply accumulate with scalar
-+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
-+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
-+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
-+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
-+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
-+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-+//Vector widening multiply accumulate with scalar
-+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
-+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
-+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
-+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
-+//Vector widening saturating doubling multiply accumulate with scalar
-+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
-+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
-+//Vector multiply subtract with scalar
-+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
-+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
-+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
-+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
-+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
-+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
-+//Vector widening multiply subtract with scalar
-+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
-+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
-+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
-+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
-+//Vector widening saturating doubling multiply subtract with scalar
-+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
-+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
-+//Vector extract
-+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
-+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
-+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
-+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
-+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
-+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
-+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
-+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
-+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
-+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
-+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
-+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
-+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
-+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
-+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
-+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
-+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
-+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
-+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
-+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
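To make the VREV description above concrete, here is an editorial sketch (not part of the original header; all names are illustrative) that models VREV32.8 with plain arrays: reversing the 8-bit elements inside each 32-bit group turns {0,1,2,3,4,5,6,7} into {3,2,1,0,7,6,5,4}, which is exactly the permutation vrev32_u8 performs.

#include <stdint.h>
#include <stdio.h>

/* Reverse 'group'-byte blocks in place: group = 4 models VREV32.8, group = 2 models VREV16.8. */
static void rev_groups(uint8_t *v, int len, int group)
{
    for (int base = 0; base < len; base += group)
        for (int i = 0; i < group / 2; i++) {
            uint8_t t = v[base + i];
            v[base + i] = v[base + group - 1 - i];
            v[base + group - 1 - i] = t;
        }
}

int main(void)
{
    uint8_t d[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    rev_groups(d, 8, 4);                              /* same permutation as vrev32_u8 */
    for (int i = 0; i < 8; i++) printf("%u ", d[i]);  /* prints: 3 2 1 0 7 6 5 4 */
    printf("\n");
    return 0;
}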
-+//Other single operand arithmetic
-+//Absolute: Vd[i] = |Va[i]|
-+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
-+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
-+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
-+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
-+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-+//Saturating absolute: Vd[i] = sat(|Va[i]|)
-+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
-+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
-+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
-+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-+//Negate: Vd[i] = - Va[i]
-+int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
-+int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
-+int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
-+float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
-+int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
-+int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
-+int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
-+float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
-+//Saturating Negate: sat(Vd[i] = - Va[i])
-+int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
-+int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
-+int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
-+int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
-+int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
-+int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
-+//Count leading sign bits
-+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
-+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
-+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
-+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
-+//Count leading zeros
-+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
-+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
-+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
-+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
-+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
-+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
-+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
-+//Count number of set bits
-+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
-+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
-+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
-+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
-+//Reciprocal estimate
-+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-+//Reciprocal square root estimate
-+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-+//Logical operations
-+//Bitwise not
-+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
-+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
-+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
-+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
-+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
-+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
-+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
-+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
-+//Bitwise and
-+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
-+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
-+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
-+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
-+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
-+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
-+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
-+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
-+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
-+//Bitwise or
-+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
-+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
-+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
-+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
-+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
-+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
-+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
-+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
-+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
-+//Bitwise exclusive or (EOR or XOR)
-+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
-+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
-+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
-+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
-+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
-+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
-+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
-+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
-+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
-+//Bit Clear
-+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
-+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
-+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
-+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
-+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
-+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
-+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
-+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
-+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-+//Bitwise OR complement
-+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
-+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
-+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
-+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
-+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
-+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
-+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
-+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
-+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
-+//Bitwise Select
-+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
-+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
-+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
-+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
-+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
-+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
-+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
-+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
-+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
-+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
-+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
-+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
-+//Transposition operations
-+//Transpose elements
-+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
-+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
-+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
-+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
-+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
-+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
-+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
-+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
-+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
-+//Interleave elements
-+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
-+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
-+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
-+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
-+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
-+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
-+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
-+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
-+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
-+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
-+//De-Interleave elements
-+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
-+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
-+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
-+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
-+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
-+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
-+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
-+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
-+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
-+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
-+
-+
-+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-+// The following macros solve the problem of the "immediate parameter required" restriction of some x86 intrinsics. For a release build this is not a must,
-+//but for a debug build we need them so that the code compiles at all instead of stopping on the "Intrinsic parameter must be an immediate value" error.
-+// (see the usage sketch after the closing #endif below)
-+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
-+
-+ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
-+
-+ #define _MM_EXTRACT_EPI16 _mm_extract_epi16
-+ #define _MM_INSERT_EPI16 _mm_insert_epi16
-+#ifdef USE_SSE4
-+ #define _MM_EXTRACT_EPI8 _mm_extract_epi8
-+ #define _MM_EXTRACT_EPI32 _mm_extract_epi32
-+ #define _MM_EXTRACT_PS _mm_extract_ps
-+
-+ #define _MM_INSERT_EPI8 _mm_insert_epi8
-+ #define _MM_INSERT_EPI32 _mm_insert_epi32
-+ #define _MM_INSERT_PS _mm_insert_ps
-+#ifdef _NEON2SSE_64BIT
-+ #define _MM_INSERT_EPI64 _mm_insert_epi64
-+ #define _MM_EXTRACT_EPI64 _mm_extract_epi64
-+#endif
-+#endif //SSE4
-+#else
-+ #define _NEON2SSE_COMMA ,
-+ #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
-+ switch(LANE) \
-+ { \
-+ case 0: return NAME(a b, 0); \
-+ case 1: return NAME(a b, 1); \
-+ case 2: return NAME(a b, 2); \
-+ case 3: return NAME(a b, 3); \
-+ case 4: return NAME(a b, 4); \
-+ case 5: return NAME(a b, 5); \
-+ case 6: return NAME(a b, 6); \
-+ case 7: return NAME(a b, 7); \
-+ case 8: return NAME(a b, 8); \
-+ case 9: return NAME(a b, 9); \
-+ case 10: return NAME(a b, 10); \
-+ case 11: return NAME(a b, 11); \
-+ case 12: return NAME(a b, 12); \
-+ case 13: return NAME(a b, 13); \
-+ case 14: return NAME(a b, 14); \
-+ case 15: return NAME(a b, 15); \
-+ default: return NAME(a b, 0); \
-+ }
-+
-+ #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
-+ switch(LANE) \
-+ { \
-+ case 0: return NAME(vec p,0); \
-+ case 1: return NAME(vec p,1); \
-+ case 2: return NAME(vec p,2); \
-+ case 3: return NAME(vec p,3); \
-+ case 4: return NAME(vec p,4); \
-+ case 5: return NAME(vec p,5); \
-+ case 6: return NAME(vec p,6); \
-+ case 7: return NAME(vec p,7); \
-+ default: return NAME(vec p,0); \
-+ }
-+
-+ #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
-+ switch(LANE) \
-+ { \
-+ case case0: return NAME(vec p,case0); \
-+ case case1: return NAME(vec p,case1); \
-+ case case2: return NAME(vec p,case2); \
-+ case case3: return NAME(vec p,case3); \
-+ default: return NAME(vec p,case0); \
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
-+ }
-+
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
-+ }
-+
-+#ifdef _NEON2SSE_64BIT
-+ //the special case of functions available only for SSE4 and 64-bit build.
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
-+ {
-+ switch(LANE) {
-+ case 0:
-+ return _mm_insert_epi64(vec, p, 0);
-+ case 1:
-+ return _mm_insert_epi64(vec, p, 1);
-+ default:
-+ return _mm_insert_epi64(vec, p, 0);
-+ }
-+ }
-+
-+ _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
-+ {
-+ if (LANE ==0) return _mm_extract_epi64(val, 0);
-+ else return _mm_extract_epi64(val, 1);
-+ }
-+#endif
-+
-+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-+ {
-+ _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
-+ }
-+
-+#endif //USE_SSE4
-+
-+#endif //#ifdef NDEBUG
-+
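A brief, hypothetical usage sketch (editorial, not part of the header) of what the switch-based wrappers above buy: the raw _mm_insert_epi16 intrinsic demands a compile-time immediate for the lane, so a lane index that is only known at run time has to go through _MM_INSERT_EPI16, which switches on the value and calls the intrinsic with a literal in every branch.

#include "arm_neon.h"  /* assumed: the NEON-to-SSE translation header above, installed under this name */

/* Hypothetical helper: write 'value' into 16-bit lane 'lane' of 'vec' when 'lane' is a runtime value.
   Calling _mm_insert_epi16(vec, value, lane) directly would be rejected with
   "Intrinsic parameter must be an immediate value" on the affected compilers/build modes. */
static __m128i set_lane16(__m128i vec, int value, int lane)
{
    return _MM_INSERT_EPI16(vec, value, lane);
}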
-+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+// Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
-+// or to implement some specific, commonly used operations that are missing in SSE
-+#ifdef USE_SSE4
-+ #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
-+ #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
-+ #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
-+
-+ #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
-+ #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
-+ #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
-+
-+ #define _MM_MAX_EPI8 _mm_max_epi8
-+ #define _MM_MAX_EPI32 _mm_max_epi32
-+ #define _MM_MAX_EPU16 _mm_max_epu16
-+ #define _MM_MAX_EPU32 _mm_max_epu32
-+
-+ #define _MM_MIN_EPI8 _mm_min_epi8
-+ #define _MM_MIN_EPI32 _mm_min_epi32
-+ #define _MM_MIN_EPU16 _mm_min_epu16
-+ #define _MM_MIN_EPU32 _mm_min_epu32
-+
-+ #define _MM_BLENDV_EPI8 _mm_blendv_epi8
-+ #define _MM_PACKUS_EPI32 _mm_packus_epi32
-+ #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
-+
-+ #define _MM_MULLO_EPI32 _mm_mullo_epi32
-+ #define _MM_MUL_EPI32 _mm_mul_epi32
-+
-+ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
-+#else //no SSE4 !!!!!!
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi8(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi16(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ return _mm_unpacklo_epi32(a, zero);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi8(zero, a);
-+ return _mm_unpacklo_epi8(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi16(zero, a);
-+ return _mm_unpacklo_epi16(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
-+ {
-+ __m128i zero = _mm_setzero_si128();
-+ __m128i sign = _mm_cmpgt_epi32(zero, a);
-+ return _mm_unpacklo_epi32(a, sign);
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t tmp[4];
-+ _mm_store_si128((__m128i*)tmp, vec);
-+ return tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t tmp[16];
-+ _mm_store_si128((__m128i*)tmp, vec);
-+ return (int)tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t tmp[4];
-+ _mm_store_si128((__m128i*)tmp, _M128i(vec));
-+ return tmp[LANE];
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = (int8_t)p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+ }
-+
-+ _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-+ __m128 tmp, vec_masked, p_masked;
-+ mask[LANE >> 4] = 0x0; //here LANE is not the actual lane index but the _mm_insert_ps immediate (lane << 4), so shift it back
-+ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
-+ tmp = _mm_or_ps(vec_masked, p_masked);
-+ return tmp;
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi8 (a, b);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi32(a, b);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
-+ {
-+ __m128i c8000, b_s, a_s, cmp;
-+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-+ b_s = _mm_sub_epi16 (b, c8000);
-+ a_s = _mm_sub_epi16 (a, c8000);
-+ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
-+ {
-+ __m128i c80000000, b_s, a_s, cmp;
-+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-+ b_s = _mm_sub_epi32 (b, c80000000);
-+ a_s = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi8 (b, a);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i cmp, resa, resb;
-+ cmp = _mm_cmpgt_epi32(b, a);
-+ resa = _mm_and_si128 (cmp, a);
-+ resb = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(resa, resb);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
-+ {
-+ __m128i c8000, b_s, a_s, cmp;
-+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-+ b_s = _mm_sub_epi16 (b, c8000);
-+ a_s = _mm_sub_epi16 (a, c8000);
-+ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
-+ {
-+ __m128i c80000000, b_s, a_s, cmp;
-+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-+ b_s = _mm_sub_epi32 (b, c80000000);
-+ a_s = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
-+ a_s = _mm_and_si128 (cmp,a);
-+ b_s = _mm_andnot_si128 (cmp,b);
-+ return _mm_or_si128(a_s, b_s);
-+ }
-+
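The _MM_MAX_EPU16/_MM_MAX_EPU32 and _MM_MIN_EPU* fallbacks above all lean on the same identity: SSE2 only has signed greater-than, but subtracting the sign-bit constant (0x8000 or 0x80000000, equivalently XOR-ing it in) from both operands maps unsigned order onto signed order. A scalar sanity check of that identity, offered as an illustration only (names are editorial):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the bias trick used by _MM_MAX_EPU32 above:
   a > b as unsigned  <=>  (a ^ 0x80000000) > (b ^ 0x80000000) as signed. */
static uint32_t max_u32_via_signed(uint32_t a, uint32_t b)
{
    int32_t as = (int32_t)(a ^ 0x80000000u);  /* same effect as subtracting 0x80000000 mod 2^32 */
    int32_t bs = (int32_t)(b ^ 0x80000000u);
    return (as > bs) ? a : b;
}

int main(void)
{
    assert(max_u32_via_signed(0xFFFFFFFFu, 1u) == 0xFFFFFFFFu);  /* a naive signed compare would pick 1 */
    assert(max_u32_via_signed(7u, 9u) == 9u);
    return 0;
}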
-+ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below
-+ {
-+ //it assumes each mask byte is always either 0xff or 0 (as in all use cases below), while the original _mm_blendv_epi8 looks only at the MSB of each mask byte (see the scalar illustration after this function).
-+ __m128i a_masked, b_masked;
-+ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
-+ a_masked = _mm_andnot_si128 (mask,a);
-+ return _mm_or_si128(a_masked, b_masked);
-+ }
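The restriction noted in the comment above is worth spelling out: the AND/ANDNOT emulation agrees with _mm_blendv_epi8 only for mask bytes that are 0x00 or 0xFF (what the compare intrinsics produce), while the real SSE4.1 instruction looks only at the most significant bit of each mask byte. A per-byte scalar illustration (editorial, not library code):

#include <assert.h>
#include <stdint.h>

/* Per-byte models: 'blend_real' follows the SSE4.1 semantics (the MSB of the mask byte selects),
   'blend_emulated' follows the AND/ANDNOT fallback above. */
static uint8_t blend_real(uint8_t a, uint8_t b, uint8_t m)     { return (m & 0x80) ? b : a; }
static uint8_t blend_emulated(uint8_t a, uint8_t b, uint8_t m) { return (uint8_t)((m & b) | (~m & a)); }

int main(void)
{
    /* Canonical masks (0x00 / 0xFF): both agree. */
    assert(blend_real(0x12, 0x34, 0xFF) == blend_emulated(0x12, 0x34, 0xFF));
    assert(blend_real(0x12, 0x34, 0x00) == blend_emulated(0x12, 0x34, 0x00));
    /* Non-canonical mask byte: they differ, which is why the comment above restricts
       the emulation to all-ones/all-zeros masks. */
    assert(blend_real(0x12, 0x34, 0x0F) != blend_emulated(0x12, 0x34, 0x0F));
    return 0;
}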
-+
-+ _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ __m128i a16, b16, res, reshi,cmp, zero;
-+ zero = _mm_setzero_si128();
-+ a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
-+ b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
-+ res = _mm_unpacklo_epi64(a16, b16); //result without saturation
-+ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
-+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
-+ res = _mm_andnot_si128(cmp,res); //if cmp is zero, keep res; otherwise the value was negative and the result becomes 0
-+ cmp = _mm_cmpgt_epi16(reshi,zero); //if the hi part is positive
-+ return _mm_or_si128(res, cmp); //if positive, the value does not fit into 16 bits and needs to saturate to 0xffff
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
-+ {
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ __m128i a16, res, reshi,cmp, zero;
-+ zero = _mm_setzero_si128();
-+ a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
-+ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
-+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
-+ res = _mm_andnot_si128(cmp, a16); //if cmp is zero, keep a16; otherwise the value was negative and the result becomes 0
-+ cmp = _mm_cmpgt_epi16(reshi,zero); //if the hi part is positive
-+ return _mm_or_si128(res, cmp); //if positive, the value does not fit into 16 bits and needs to saturate to 0xffff
-+ }
-+
-+
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
-+ int64_t res64;
-+ int i;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ for (i = 0; i<4; i++) {
-+ res64 = atmp[i] * btmp[i];
-+ res[i] = (int)(res64 & 0xffffffff);
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
-+ {
-+ __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
-+ sign = _mm_xor_si128 (a, b);
-+ sign = _mm_srai_epi32 (sign, 31); //propagate the sign bit to the whole lane: all ones where the signs of a and b differ, all zeros otherwise
-+ zero = _mm_setzero_si128();
-+ a_neg = _mm_abs_epi32 (a); //take the absolute values of a and b
-+ b_neg = _mm_abs_epi32 (b); //take the absolute values of a and b
-+ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
-+ mul_us_neg = _mm_sub_epi64(zero, mul_us);
-+ mul_us_neg = _mm_and_si128(sign, mul_us_neg);
-+ mul_us = _mm_andnot_si128(sign, mul_us);
-+ return _mm_or_si128 (mul_us, mul_us_neg);
-+ }
-+
-+ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
-+ {
-+ __m128i res;
-+ res = _mm_cmpeq_epi32 (a, b);
-+ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
-+ }
-+#endif //SSE4
-+
-+//the special case of functions working only for 32 bits, no SSE4
-+_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
-+ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
-+ __m128i vec_masked, p_masked;
-+ pvec[LANE] = p;
-+ mask[LANE] = 0x0;
-+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-+ return _mm_or_si128(vec_masked, p_masked);
-+}
-+
-+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
-+{
-+ _NEON2SSE_ALIGN_16 int64_t tmp[2];
-+ _mm_store_si128((__m128i*)tmp, val);
-+ return tmp[LANE];
-+}
-+
-+#ifndef _NEON2SSE_64BIT_SSE4
-+ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
-+ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
-+#endif
-+
-+int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
-+_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
-+{
-+ //Overflow happens only if a and the doubled result (a*2) have opposite signs
-+ __m128i c7fffffff, res, res_sat, res_xor_a;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_slli_epi32 (a, 1); // res = a*2
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
-+ res_xor_a = _mm_xor_si128(res, a);
-+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if res and a have opposite signs (overflow), all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
-+
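A scalar restatement of the vqd_s32 logic above, offered as a hedged reference model only: doubling a signed 32-bit value overflows exactly when a and a*2 end up with opposite signs, and the saturated value chosen in that case, (a >> 31) + 0x7fffffff, is INT32_MAX for non-negative a and INT32_MIN for negative a.

#include <assert.h>
#include <stdint.h>

/* Scalar model of the lane-wise saturating doubling performed by vqd_s32 above. */
static int32_t qd_s32_scalar(int32_t a)
{
    int32_t doubled = (int32_t)((uint32_t)a << 1);  /* a * 2 with wraparound */
    int32_t sat = (a < 0) ? INT32_MIN : INT32_MAX;  /* mirrors (a >> 31) + 0x7fffffff */
    return ((a ^ doubled) < 0) ? sat : doubled;     /* opposite signs => overflow */
}

int main(void)
{
    assert(qd_s32_scalar(5) == 10);
    assert(qd_s32_scalar(0x40000000) == INT32_MAX);  /* saturates upward */
    assert(qd_s32_scalar(INT32_MIN) == INT32_MIN);   /* saturates downward */
    return 0;
}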
-+
-+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+//*************************************************************************
-+//*************************************************************************
-+//***************** Functions redefinition/implementation starts here *****
-+//*************************************************************************
-+//*************************************************************************
-+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+
-+/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here as in the following sample:
-+#ifdef ARM
-+#define vector_addq_s32 _mm_add_epi32
-+#else //if we have IA
-+#define vector_addq_s32 vadd_s32
-+#endif
-+
-+********************************************************************************************
-+Functions below are organised in the following way:
-+
-+Each NEON intrinsic function is implemented in one of the following ways:
-+1. it has a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON declaration under the corresponding #define statement
-+2. it is implemented with more than one x86 intrinsic - in this case it is shaped as an inline C function with a return statement
-+3. it refers to another NEON function that returns the same result and is implemented on x86 as above - in this case it is shaped as a matching NEON function definition
-+4. for about 5% of the functions, because the corresponding x86 SIMD operation is unavailable or too slow,
-+a serial implementation is provided along with a compiler warning. If such functions are on your app's critical path,
-+please consider removing them from your code.
-+*/
-+
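Seen from the caller's side, options 1 and 2 above mean that source written against the NEON names compiles unchanged on x86: vaddq_s32, for instance, is a plain #define onto _mm_add_epi32, while functions such as vhadd_s8 become small inline wrappers. A hedged usage sketch, assuming this translation header is what gets included as arm_neon.h on the x86 side:

#include "arm_neon.h"  /* assumed: resolves to the NEON-to-SSE translation header above */

/* Adds four packed 32-bit integers using the NEON spelling; with this header the
   call expands to a single _mm_add_epi32 (option 1 in the list above). */
static int32x4_t add4(int32x4_t a, int32x4_t b)
{
    return vaddq_s32(a, b);
}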
-+//***********************************************************************
-+//************************ Vector add *****************************
-+//***********************************************************************
-+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
-+ return res64;
-+}
-+
-+
-+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
-+{
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-+#define vadd_u8 vadd_s8
-+
-+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-+#define vadd_u16 vadd_s16
-+
-+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-+#define vadd_u32 vadd_s32
-+
-+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
-+{
-+ uint64x1_t res64;
-+ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
-+ return res64;
-+}
-+
-+
-+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-+#define vaddq_s8 _mm_add_epi8
-+
-+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-+#define vaddq_s16 _mm_add_epi16
-+
-+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-+#define vaddq_s32 _mm_add_epi32
-+
-+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-+#define vaddq_s64 _mm_add_epi64
-+
-+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-+#define vaddq_f32 _mm_add_ps
-+
-+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-+#define vaddq_u8 _mm_add_epi8
-+
-+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-+#define vaddq_u16 _mm_add_epi16
-+
-+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-+#define vaddq_u32 _mm_add_epi32
-+
-+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-+#define vaddq_u64 _mm_add_epi64
-+
-+//**************************** Vector long add *****************************:
-+//***********************************************************************
-+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi16 (a16, b16);
-+}
-+
-+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a32, b32);
-+}
-+
-+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
-+{
-+ //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 ( a64, b64);
-+}
-+
-+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi16 (a16, b16);
-+}
-+
-+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a32, b32);
-+}
-+
-+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
-+{
-+ //may not be optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a64, b64);
-+}
-+
-+//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
-+//*************** *********************************************************************
-+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-+_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi16 (a, b16);
-+}
-+
-+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-+_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
-+ return _mm_add_epi32 (a, b32);
-+}
-+
-+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-+_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a, b64);
-+}
-+
-+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi16 (a, b16);
-+}
-+
-+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
-+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi32 (a, b32);
-+}
-+
-+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_add_epi64 (a, b64);
-+}
-+
-+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
-+//*************************************************************************************************************************
-+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vhaddq_s8(_pM128i(a), _pM128i(b))); //use the signed variant here; the unsigned one would mishandle negative inputs
-+}
-+
-+
-+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = vshrq_n_s8(tmp2,1);
-+ return _mm_add_epi8(tmp1,tmp2);
-+}
-+
-+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srai_epi16(tmp2,1);
-+ return _mm_add_epi16(tmp1,tmp2);
-+}
-+
-+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srai_epi32(tmp2,1);
-+ return _mm_add_epi32(tmp1,tmp2);
-+}
-+
-+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
-+{
-+ __m128i c1, sum, res;
-+ c1 = _mm_set1_epi8(1);
-+ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
-+ res = _mm_xor_si128(a, b); //for rounding compensation
-+ res = _mm_and_si128(res,c1); //for rounding compensation
-+ return _mm_sub_epi8 (sum, res); //actual rounding compensation
-+}
-+
-+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
-+{
-+ __m128i sum, res;
-+ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
-+ res = _mm_xor_si128(a, b); //for rounding compensation
-+ res = _mm_slli_epi16 (res,15); //shift left then back right to
-+ res = _mm_srli_epi16 (res,15); //get 1 or zero
-+ return _mm_sub_epi16 (sum, res); //actual rounding compensation
-+}
-+
-+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
-+{
-+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_and_si128(a,b);
-+ tmp2 = _mm_xor_si128(a,b);
-+ tmp2 = _mm_srli_epi32(tmp2,1);
-+ return _mm_add_epi32(tmp1,tmp2);
-+}
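The (x & y) + ((x ^ y) >> 1) identity used throughout the vhadd implementations above is the standard overflow-free truncating average: x + y = 2*(x & y) + (x ^ y), so the halved sum can be formed without ever computing the full sum. A scalar sanity check, offered as an illustration only (names are editorial):

#include <assert.h>
#include <stdint.h>

/* Truncating halving add on int8_t without widening, mirroring vhaddq_s8 above. */
static int8_t hadd_s8(int8_t x, int8_t y)
{
    return (int8_t)((x & y) + ((x ^ y) >> 1));  /* arithmetic shift keeps the sign */
}

int main(void)
{
    assert(hadd_s8(100, 100) == 100);   /* 200 would overflow int8_t, the trick does not */
    assert(hadd_s8(-128, 127) == -1);   /* (-128 + 127) >> 1 = -1, truncation toward minus infinity */
    assert(hadd_s8(3, 4) == 3);         /* (3 + 4) >> 1 = 3 */
    return 0;
}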
-+
-+//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
-+//*****************************************************************************************************************************
-+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-+}
-+
-+
-+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-+}
-+
-+
-+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
-+{
-+ //no signed average in x86 SIMD, go to unsigned
-+ __m128i c128, au, bu, sum;
-+ c128 = _mm_set1_epi8(0x80); //-128
-+ au = _mm_sub_epi8(a, c128); //add 128
-+ bu = _mm_sub_epi8(b, c128); //add 128
-+ sum = _mm_avg_epu8(au, bu);
-+ return _mm_add_epi8 (sum, c128); //sub 128
-+}
-+
-+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
-+{
-+ //no signed average in x86 SIMD, go to unsigned
-+ __m128i cx8000, au, bu, sum;
-+ cx8000 = _mm_set1_epi16(0x8000); // - 32768
-+ au = _mm_sub_epi16(a, cx8000); //add 32768
-+ bu = _mm_sub_epi16(b, cx8000); //add 32768
-+ sum = _mm_avg_epu16(au, bu);
-+ return _mm_add_epi16 (sum, cx8000); //sub 32768
-+}
-+
-+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
-+{
-+ //need to avoid overflow
-+ __m128i a2, b2, res, sum;
-+ a2 = _mm_srai_epi32(a,1); //a2=a/2;
-+ b2 = _mm_srai_epi32(b,1); // b2=b/2;
-+ res = _mm_or_si128(a,b); //for rounding
-+ res = _mm_slli_epi32 (res,31); //shift left then back right to
-+ res = _mm_srli_epi32 (res,31); //get 1 or zero
-+ sum = _mm_add_epi32(a2,b2);
-+ return _mm_add_epi32(sum,res);
-+}
-+
-+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
-+
-+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
-+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
-+
-+
-+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
-+{
-+ //need to avoid overflow
-+ __m128i a2, b2, res, sum;
-+ a2 = _mm_srli_epi32(a,1); //a2=a/2;
-+ b2 = _mm_srli_epi32(b,1); // b2=b/2;
-+ res = _mm_or_si128(a,b); //for rounding
-+ res = _mm_slli_epi32 (res,31); //shift left then back right to
-+ res = _mm_srli_epi32 (res,31); //get 1 or zero
-+ sum = _mm_add_epi32(a2,b2);
-+ return _mm_add_epi32(sum,res);
-+}
-+
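The unsigned halving adds above lean on _mm_avg_epu8/_mm_avg_epu16, which round up: the truncating vhadd variants subtract the bit that avg added back out, while the rounding vrhadd variants can use avg as-is. An editorial scalar sketch of both, with made-up helper names:

#include <stdint.h>
#include <assert.h>

/* avg() computes (a + b + 1) >> 1, like _mm_avg_epu8; the truncating
   halving add removes the rounding bit, which is (a ^ b) & 1. */
static uint8_t avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)(((unsigned)a + b + 1) >> 1);
}

static uint8_t vhadd_u8_scalar(uint8_t a, uint8_t b)   /* (a + b) >> 1 */
{
    return (uint8_t)(avg_u8(a, b) - ((a ^ b) & 1));    /* undo the rounding */
}

static uint8_t vrhadd_u8_scalar(uint8_t a, uint8_t b)  /* (a + b + 1) >> 1 */
{
    return avg_u8(a, b);                               /* already rounded */
}

int main(void)
{
    assert(vhadd_u8_scalar(3, 4) == 3);
    assert(vrhadd_u8_scalar(3, 4) == 4);
    assert(vhadd_u8_scalar(255, 255) == 255);
    return 0;
}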
-+//****************** VQADD: Vector saturating add ************************
-+//************************************************************************
-+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int64x1_t res;
-+ uint64_t a64, b64;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 + b64;
-+ a64 = (a64 >> 63) + (~_SIGNBIT64);
-+ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
-+ res.m64_u64[0] = a64;
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t a64, b64;
-+ uint64x1_t res;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 + b64;
-+ if (res.m64_u64[0] < a64) {
-+ res.m64_u64[0] = ~(uint64_t)0;
-+ }
-+ return res;
-+}
-+
-+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-+#define vqaddq_s8 _mm_adds_epi8
-+
-+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-+#define vqaddq_s16 _mm_adds_epi16
-+
-+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
-+{
-+    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
-+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_add_epi32(a, b);
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
-+ res_xor_a = _mm_xor_si128(res, a);
-+ b_xor_a_ = _mm_xor_si128(b, a);
-+ res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
-+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
-+
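vqaddq_s32 above detects signed overflow purely with masks: overflow can only happen when a and b share a sign and the raw sum does not, and the saturated value depends only on the sign of a. The same rule in plain scalar C, as an illustrative sketch (the names are editorial):

#include <stdint.h>
#include <assert.h>

/* Saturating signed 32-bit add, following the rule the masks above encode. */
static int32_t qadd_s32(int32_t a, int32_t b)
{
    uint32_t ua = (uint32_t)a, ub = (uint32_t)b;
    uint32_t sum = ua + ub;                      /* wraparound add */
    if ((~(ua ^ ub) & (ua ^ sum)) >> 31)         /* same signs, sum's sign flipped */
        return (a < 0) ? INT32_MIN : INT32_MAX;  /* saturate toward a's sign */
    return (int32_t)sum;                         /* cast assumed to wrap (two's complement) */
}

int main(void)
{
    assert(qadd_s32(INT32_MAX, 1) == INT32_MAX);
    assert(qadd_s32(INT32_MIN, -1) == INT32_MIN);
    assert(qadd_s32(100, -300) == -200);
    return 0;
}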
-+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] + btmp[0];
-+ res[1] = atmp[1] + btmp[1];
-+
-+ atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
-+ atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
-+
-+ if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
-+ res[0] = atmp[0];
-+ }
-+ if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
-+ res[1] = atmp[1];
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-+#define vqaddq_u8 _mm_adds_epu8
-+
-+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
-+#define vqaddq_u16 _mm_adds_epu16
-+
-+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
-+{
-+ __m128i c80000000, cmp, subsum, suba, sum;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ sum = _mm_add_epi32 (a, b);
-+ subsum = _mm_sub_epi32 (sum, c80000000);
-+ suba = _mm_sub_epi32 (a, c80000000);
-+ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
-+ return _mm_or_si128 (sum, cmp); //saturation
-+}
-+
-+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
-+ {
-+ __m128i c80000000, sum, cmp, suba, subsum;
-+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-+ sum = _mm_add_epi64 (a, b);
-+ subsum = _mm_sub_epi64 (sum, c80000000);
-+ suba = _mm_sub_epi64 (a, c80000000);
-+ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
-+ return _mm_or_si128 (sum, cmp); //saturation
-+ }
-+#else
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] + btmp[0];
-+ res[1] = atmp[1] + btmp[1];
-+ if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
-+ if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
-+ return _mm_load_si128((__m128i*)(res));
-+ }
-+#endif
-+
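For the unsigned saturating adds, the SIMD code above has no unsigned compare, so it biases both operands by 0x80000000 and reuses the signed compare; the serial fallback simply checks for wraparound. A scalar sketch of that wraparound check, for illustration only:

#include <stdint.h>
#include <assert.h>

/* Unsigned saturating add: if the modular sum wrapped, clamp to all ones. */
static uint32_t qadd_u32(uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;                    /* wraps modulo 2^32 on overflow */
    return (sum < a) ? 0xFFFFFFFFu : sum;    /* wrapped, so saturate */
}

int main(void)
{
    assert(qadd_u32(0xFFFFFFF0u, 0x20u) == 0xFFFFFFFFu);
    assert(qadd_u32(1, 2) == 3);
    return 0;
}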
-+
-+//******************* Vector add high half (truncated) ******************
-+//************************************************************************
-+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi16 (a, b);
-+ sum = _mm_srai_epi16 (sum, 8);
-+ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
-+{
-+ int16x4_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi32 (a, b);
-+ sum = _mm_srai_epi32(sum, 16);
-+ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi64 (a, b);
-+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sum);
-+}
-+
-+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi16 (a, b);
-+ sum = _mm_srli_epi16 (sum, 8);
-+ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
-+{
-+ uint16x4_t res64;
-+ __m128i sum;
-+ sum = _mm_add_epi32 (a, b);
-+ sum = _mm_srli_epi32 (sum, 16);
-+ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
-+ return64(sum);
-+}
-+
-+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-+#define vaddhn_u64 vaddhn_s64
-+
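The vaddhn family computes a full-width sum and keeps only its upper half; the SSE sequences above do this with an arithmetic shift followed by a pack. Roughly the scalar equivalent for the 16-to-8-bit case, as an editorial sketch (assuming arithmetic right shifts and two's complement wrap on the narrowing casts):

#include <stdint.h>
#include <assert.h>

/* "Add and take the high half" narrowing: the sum wraps at 16 bits and
   only its upper 8 bits are kept. */
static int8_t addhn_s16(int16_t a, int16_t b)
{
    int16_t sum = (int16_t)(a + b);
    return (int8_t)(sum >> 8);
}

int main(void)
{
    assert(addhn_s16(0x1234, 0x0111) == 0x13);  /* 0x1345, high byte 0x13 */
    assert(addhn_s16(-256, 0) == -1);           /* 0xFF00, high byte 0xFF */
    return 0;
}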
-+//*********** Vector rounding add high half: vraddhn_<type> ******************.
-+//***************************************************************************
-+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sum = _mm_srai_epi16 (sum, 8); //get high half
-+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
-+ sum = _mm_packs_epi16 (sum, sum);
-+ return64(sum);
-+}
-+
-+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
-+{
-+ //SIMD may be not optimal, serial may be faster
-+ int16x4_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sum = _mm_srai_epi32 (sum, 16); //get high half
-+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
-+ sum = _mm_packs_epi32 (sum, sum);
-+ return64(sum);
-+}
-+
-+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ //SIMD may be not optimal, serial may be faster
-+ int32x2_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi64 (a, b);
-+ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
-+ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
-+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
-+ return64(sum);
-+}
-+
-+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sum = _mm_srai_epi16 (sum, 8); //get high half
-+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
-+ sum = _mm_packus_epi16 (sum, sum);
-+ return64(sum);
-+}
-+
-+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
-+{
-+ //SIMD may be not optimal, serial may be faster
-+ uint16x4_t res64;
-+ __m128i sum, mask1;
-+ sum = _mm_add_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sum = _mm_srai_epi32 (sum, 16); //get high half
-+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
-+ sum = _MM_PACKUS1_EPI32 (sum);
-+ return64(sum);
-+}
-+
-+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-+#define vraddhn_u64 vraddhn_s64
-+
-+//**********************************************************************************
-+//********* Multiplication *************************************
-+//**************************************************************************************
-+
-+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-+//As we don't widen the result, these functions are equal to "multiply low" in x86
-+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits in SSE
-+ int8x8_t res64;
-+ __m128i a128, b128, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (a128, b128);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
-+ return64(res);
-+}
-+
-+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
-+#define vmul_s16 vmul_u16
-+
-+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
-+#define vmul_s32 vmul_u32
-+
-+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t tmp;
-+ __m64_128 res64;
-+ tmp = _mm_mul_ps(_pM128(a),_pM128(b));
-+ _M64f(res64, tmp); //use low 64 bits
-+ return res64;
-+}
-+
-+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits in SSE
-+ uint8x8_t res64;
-+ __m128i mask, a128, b128, res;
-+ mask = _mm_set1_epi16(0xff);
-+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
-+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
-+ res = _mm_mullo_epi16 (a128, b128);
-+ res = _mm_and_si128(res, mask); //to avoid saturation
-+ res = _mm_packus_epi16 (res,res); //use only low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
-+ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
-+ return res;
-+}
-+
-+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
-+{
-+ //may be optimized
-+ poly8x8_t res64;
-+ __m128i a64, b64, c1, res, tmp, bmasked;
-+ int i;
-+ a64 = _pM128i(a);
-+ b64 = _pM128i(b);
-+ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b64, c1); //0x1
-+ res = vmulq_u8(a64, bmasked);
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b64, c1); //0x1
-+ tmp = vmulq_u8(a64, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return64 (res);
-+}
-+
-+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ //solution may be not optimal
-+ __m128i a16, b16, r16_1, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (a16, b16);
-+ //swap hi and low part of a and b to process the remaining data
-+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (a16, b16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-+
-+ return _mm_unpacklo_epi64(r16_1, r16_2);
-+}
-+
-+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-+#define vmulq_s16 _mm_mullo_epi16
-+
-+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
-+
-+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-+#define vmulq_f32 _mm_mul_ps
-+
-+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ //solution may be not optimal
-+ __m128i maskff, a16, b16, r16_1, r16_2;
-+ maskff = _mm_set1_epi16(0xff);
-+ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (a16, b16);
-+ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
-+ //swap hi and low part of a and b to process the remaining data
-+ a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (a16, b16);
-+ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
-+ return _mm_packus_epi16 (r16_1, r16_2);
-+}
-+
-+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-+#define vmulq_u16 _mm_mullo_epi16
-+
-+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
-+
-+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
-+{
-+ //may be optimized
-+ __m128i c1, res, tmp, bmasked;
-+ int i;
-+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b, c1); //0x1
-+ res = vmulq_u8(a, bmasked);
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b, c1); //0x1
-+ tmp = vmulq_u8(a, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return res;
-+}
-+
-+//************************* Vector long multiply ***********************************
-+//****************************************************************************
-+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
-+{
-+ //no 8 bit simd multiply, need to go to 16 bits
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-+}
-+
-+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
-+{
-+ #ifdef USE_SSE4
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
-+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-+ #else
-+ __m128i low, hi, a128,b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ low = _mm_mullo_epi16(a128,b128);
-+ hi = _mm_mulhi_epi16(a128,b128);
-+ return _mm_unpacklo_epi16(low,hi);
-+ #endif
-+}
-+
-+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
-+{
-+ __m128i ab, ba, a128, b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-+    return _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+}
-+
-+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
-+{
-+ //no 8 bit simd multiply, need to go to 16 bits
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-+}
-+
-+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
-+{
-+ #ifdef USE_SSE4
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
-+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-+ #else
-+ __m128i a128,b128,low, hi;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ low = _mm_mullo_epi16(a128,b128);
-+ hi = _mm_mulhi_epu16(a128,b128);
-+ return _mm_unpacklo_epi16(low,hi);
-+ #endif
-+}
-+
-+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
-+{
-+ ///may be not optimal compared with serial implementation
-+ __m128i ab, ba, a128, b128;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-+    return _mm_mul_epu32 (ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+}
-+
-+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
-+{
-+ //may be optimized
-+ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
-+ int i;
-+ a128 = _pM128i(a);
-+ b128 = _pM128i(b);
-+ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
-+ c1 = vshrq_n_u8(c1,7); //0x1
-+ bmasked = _mm_and_si128(b128, c1); //0x1
-+
-+ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
-+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-+ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
-+ for(i = 1; i<8; i++) {
-+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-+ bmasked = _mm_and_si128(b128, c1); //0x1
-+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-+ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
-+ res = _mm_xor_si128(res, tmp);
-+ }
-+ return res;
-+}
-+
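vmul_p8 and vmull_p8 above implement polynomial (carry-less) multiplication by masking one bit of b at a time and XOR-accumulating the shifted partial products. A scalar reference for the same operation, included only as an illustration (the helper name is made up); vmull_p8 keeps the full 16-bit result, vmul_p8 only its low 8 bits:

#include <stdint.h>
#include <assert.h>

/* Carry-less (GF(2) polynomial) 8 x 8 -> 16 bit multiply. */
static uint16_t pmull_u8(uint8_t a, uint8_t b)
{
    uint16_t res = 0;
    for (int i = 0; i < 8; i++)
        if (b & (1u << i))
            res ^= (uint16_t)((uint16_t)a << i); /* partial products combined with XOR, no carries */
    return res;
}

int main(void)
{
    assert(pmull_u8(0x03, 0x03) == 0x05);  /* (x + 1)^2 = x^2 + 1 over GF(2) */
    assert(pmull_u8(0xFF, 0x01) == 0xFF);
    return 0;
}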
-+//****************Vector saturating doubling long multiply **************************
-+//*****************************************************************
-+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
-+{
-+    //the serial solution may be faster due to saturation
-+ __m128i res;
-+ res = vmull_s16(a, b);
-+ return vqd_s32(res);
-+}
-+
-+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+    //the serial solution may be faster due to saturation
-+ __m128i res;
-+ res = vmull_s32(a,b);
-+ return vqaddq_s64(res,res); //slow serial function!!!!
-+}
-+
-+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
-+//******************************************************************************************
-+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
-+{
-+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
-+ int8x8_t res64;
-+ __m128i b128, c128, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (c128, b128);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
-+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+
-+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
-+ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ //fma is coming soon, but right now:
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_mul_ps (_pM128(c), _pM128(b));
-+ res = _mm_add_ps (_pM128(a), res);
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
-+{
-+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
-+ uint8x8_t res64;
-+ __m128i mask, b128, c128, res;
-+ mask = _mm_set1_epi16(0xff);
-+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-+ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-+ res = _mm_mullo_epi16 (c128, b128);
-+ res = _mm_and_si128(res, mask); //to avoid saturation
-+ res = _mm_packus_epi16 (res, res);
-+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-+#define vmla_u16 vmla_s16
-+
-+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-+#define vmla_u32 vmla_s32
-+
-+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
-+{
-+ //solution may be not optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2,r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_add_epi8 (r16_1, a);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_add_epi8(r16_2, a_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_mullo_epi16 (c, b);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
-+{
-+ //fma is coming soon, but right now:
-+ __m128 res;
-+ res = _mm_mul_ps (c, b);
-+ return _mm_add_ps (a, res);
-+}
-+
-+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
-+{
-+ //solution may be not optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_add_epi8 (r16_1, a);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_add_epi8(r16_2, a_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-+#define vmlaq_u16 vmlaq_s16
-+
-+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-+#define vmlaq_u32 vmlaq_s32
-+
-+//********************** Vector widening multiply accumulate (long multiply accumulate):
-+// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
-+//********************************************************************************************
-+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
-+{
-+ int16x8_t res;
-+ res = vmull_s8(b, c);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int32x4_t res;
-+ res = vmull_s16(b, c);
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int64x2_t res;
-+ res = vmull_s32( b, c);
-+ return _mm_add_epi64 (res, a);
-+}
-+
-+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
-+{
-+ uint16x8_t res;
-+ res = vmull_u8(b, c);
-+ return _mm_add_epi16 (res, a);
-+}
-+
-+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ uint32x4_t res;
-+ res = vmull_u16(b, c);
-+ return _mm_add_epi32 (res, a);
-+}
-+
-+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int64x2_t res;
-+ res = vmull_u32( b,c);
-+ return _mm_add_epi64 (res, a);
-+}
-+
-+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
-+//********************************************************************************************
-+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
-+ int8x8_t res64;
-+ __m128i res;
-+ res64 = vmul_s8(b,c);
-+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-+ return64(res);
-+}
-+
-+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+
-+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
-+ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
-+ return64(res);
-+}
-+
-+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ __m128 res;
-+ __m64_128 res64;
-+ res = _mm_mul_ps (_pM128(c), _pM128(b));
-+ res = _mm_sub_ps (_pM128(a), res);
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
-+ uint8x8_t res64;
-+ __m128i res;
-+ res64 = vmul_u8(b,c);
-+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-+ return64(res);
-+}
-+
-+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-+#define vmls_u16 vmls_s16
-+
-+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-+#define vmls_u32 vmls_s32
-+
-+
-+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
-+{
-+ //solution may be not optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
-+ r16_1 = _mm_sub_epi8 (a, r16_1);
-+ //swap hi and low part of a, b, c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_sub_epi8 (a_2, r16_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_mullo_epi16 (c, b);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
-+{
-+ __m128 res;
-+ res = _mm_mul_ps (c, b);
-+ return _mm_sub_ps (a, res);
-+}
-+
-+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
-+{
-+ //solution may be not optimal
-+ // no 8 bit simd multiply, need to go to 16 bits
-+ __m128i b16, c16, r16_1, a_2, r16_2;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-+ r16_1 = _mm_mullo_epi16 (b16, c16);
-+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-+ r16_1 = _mm_sub_epi8 (a, r16_1);
-+ //swap hi and low part of a, b and c to process the remaining data
-+ a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-+ c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-+
-+ r16_2 = _mm_mullo_epi16 (b16, c16);
-+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-+ r16_2 = _mm_sub_epi8(a_2, r16_2);
-+ return _mm_unpacklo_epi64(r16_1,r16_2);
-+}
-+
-+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-+#define vmlsq_u16 vmlsq_s16
-+
-+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-+#define vmlsq_u32 vmlsq_s32
-+
-+//******************** Vector multiply subtract long (widening multiply subtract) ************************************
-+//*************************************************************************************************************
-+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
-+{
-+ int16x8_t res;
-+ res = vmull_s8(b, c);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int32x4_t res;
-+ res = vmull_s16(b, c);
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int64x2_t res;
-+ res = vmull_s32( b,c);
-+ return _mm_sub_epi64 (a, res);
-+}
-+
-+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
-+{
-+ uint16x8_t res;
-+ res = vmull_u8(b, c);
-+ return _mm_sub_epi16 (a, res);
-+}
-+
-+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ uint32x4_t res;
-+ res = vmull_u16(b, c);
-+ return _mm_sub_epi32 (a, res);
-+}
-+
-+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
-+{
-+ //may be not optimal compared with serial implementation
-+ int64x2_t res;
-+ res = vmull_u32( b,c);
-+ return _mm_sub_epi64 (a, res);
-+}
-+
-+//****** Vector saturating doubling multiply high **********************
-+//*************************************************************************
-+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int32_t a32, b32, i;
-+ for (i = 0; i<4; i++) {
-+ a32 = (int32_t) a.m64_i16[i];
-+ b32 = (int32_t) b.m64_i16[i];
-+ a32 = (a32 * b32) >> 15;
-+ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
-+ }
-+ return res;
-+}
-+
-+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
-+{
-+ //may be not optimal compared with a serial solution
-+ int32x2_t res64;
-+ __m128i mask;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ int64x2_t mul;
-+ mul = vmull_s32(a,b);
-+ mul = _mm_slli_epi64(mul,1); //double the result
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+ return64(mul);
-+}
-+
-+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
-+{
-+ __m128i res, res_lo, mask;
-+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-+ res = _mm_mulhi_epi16 (a, b);
-+ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
-+ res_lo = _mm_mullo_epi16 (a, b);
-+ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
-+ res = _mm_add_epi16(res, res_lo); //combine results
-+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
-+}
-+
-+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-+ __m128i ab, ba, mask, mul, mul1;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-+    mul = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul = _mm_slli_epi64(mul,1); //double the result
-+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul1 = _mm_slli_epi64(mul1,1); //double the result
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-+ mul = _mm_unpacklo_epi64(mul, mul1);
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+}
-+
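vqdmulh doubles the product and keeps the high half, i.e. (a * b) >> 15 for 16-bit Q15 values, and the only input pair that can overflow is -32768 times itself, which is exactly the case the cmask trick above patches up. A scalar sketch of that rule (editorial, not part of the patch):

#include <stdint.h>
#include <assert.h>

/* Saturating doubling multiply-high for Q15 values. */
static int16_t qdmulh_s16(int16_t a, int16_t b)
{
    int32_t p = ((int32_t)a * b) >> 15;          /* assumed arithmetic shift */
    return (p == 0x8000) ? (int16_t)0x7FFF : (int16_t)p;
}

int main(void)
{
    assert(qdmulh_s16(-32768, -32768) == 32767); /* the saturated case */
    assert(qdmulh_s16(16384, 16384) == 8192);    /* 0.5 * 0.5 = 0.25 in Q15 */
    return 0;
}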
-+//********* Vector saturating rounding doubling multiply high ****************
-+//****************************************************************************
-+//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and ordering
-+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //may be not optimal compared with a serial solution
-+ int32x2_t res64;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ __m128i res_sat, mask, mask1;
-+ int64x2_t mul;
-+ mul = vmull_s32(a,b);
-+ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+ return64(mul);
-+}
-+
-+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
-+{
-+ __m128i mask, res;
-+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-+ res = _mm_mulhrs_epi16 (a, b);
-+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
-+}
-+
-+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-+ __m128i ab, ba, mask, mul, mul1, mask1;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-+    mul = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul = _mm_add_epi32 (mul, mask1); //actual rounding
-+
-+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-+ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
-+ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
-+ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
-+ //at this point start treating 2 64-bit numbers as 4 32-bit
-+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-+ mul = _mm_unpacklo_epi64(mul, mul1);
-+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
-+}
-+
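The rounding variant adds half of the discarded range before taking the high part, which is what _mm_mulhrs_epi16 computes for the 16-bit case. A scalar sketch under the assumption of arithmetic right shifts (the names are made up here):

#include <stdint.h>
#include <assert.h>

/* Rounding saturating doubling multiply-high (Q15).  Only (-32768)*(-32768)
   can exceed the int16_t range, so a single upper clamp is enough. */
static int16_t qrdmulh_s16(int16_t a, int16_t b)
{
    int64_t p = (2 * (int64_t)a * b + (1 << 15)) >> 16;
    return (p > 32767) ? (int16_t)32767 : (int16_t)p;
}

int main(void)
{
    assert(qrdmulh_s16(-32768, -32768) == 32767);  /* the saturated case */
    assert(qrdmulh_s16(16384, 16384) == 8192);     /* 0.5 * 0.5 = 0.25 in Q15 */
    return 0;
}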
-+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
-+//*************************************************************************************************************************
-+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
-+{
-+    //not an optimal SIMD solution, serial may be faster
-+ __m128i res32;
-+ res32 = vmull_s16(b, c);
-+ res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
-+ return vqaddq_s32(res32, a); //saturation
-+}
-+
-+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res64;
-+ res64 = vmull_s32(b,c);
-+ res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
-+ return vqaddq_s64(res64, a); //saturation
-+}
-+
-+//************************************************************************************
-+//****************** Vector subtract ***********************************************
-+//************************************************************************************
-+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
-+ return res64;
-+}
-+
-+
-+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
-+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
-+ return res;
-+}
-+
-+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-+#define vsub_u8 vsub_s8
-+
-+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-+#define vsub_u16 vsub_s16
-+
-+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-+#define vsub_u32 vsub_s32
-+
-+
-+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
-+{
-+ int64x1_t res64;
-+ res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
-+ return res64;
-+}
-+
-+
-+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-+#define vsubq_s8 _mm_sub_epi8
-+
-+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-+#define vsubq_s16 _mm_sub_epi16
-+
-+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-+#define vsubq_s32 _mm_sub_epi32
-+
-+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-+#define vsubq_s64 _mm_sub_epi64
-+
-+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-+#define vsubq_f32 _mm_sub_ps
-+
-+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-+#define vsubq_u8 _mm_sub_epi8
-+
-+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-+#define vsubq_u16 _mm_sub_epi16
-+
-+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-+#define vsubq_u32 _mm_sub_epi32
-+
-+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-+#define vsubq_u64 _mm_sub_epi64
-+
-+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
-+//***********************************************************************************
-+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a16, b16);
-+}
-+
-+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a32, b32);
-+}
-+
-+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
-+{
-+ //may be not optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi64 (a64, b64);
-+}
-+
-+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a16, b16);
-+}
-+
-+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a32, b32);
-+}
-+
-+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
-+{
-+ //may be not optimal
-+ __m128i a64, b64;
-+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi64 (a64, b64);
-+}
-+
-+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
-+//*****************************************************************************************************
-+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a, b16);
-+}
-+
-+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a, b32);
-+}
-+
-+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_sub_epi64 (a, b64);
-+}
-+
-+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
-+{
-+ __m128i b16;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi16 (a, b16);
-+}
-+
-+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
-+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
-+{
-+ __m128i b32;
-+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return _mm_sub_epi32 (a, b32);
-+}
-+
-+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
-+{
-+ __m128i b64;
-+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-+ return _mm_sub_epi64 (a, b64);
-+}
-+
-+//************************Vector saturating subtract *********************************
-+//*************************************************************************************
-+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-+{
-+ uint64x1_t res;
-+ uint64_t a64,b64;
-+ a64 = a.m64_u64[0];
-+ b64 = b.m64_u64[0];
-+ res.m64_u64[0] = a64 - b64;
-+
-+ a64 = (a64 >> 63) + (~_SIGNBIT64);
-+ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
-+ res.m64_u64[0] = a64;
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint64x1_t res;
-+ uint64_t a64, b64;
-+ a64 = _Ui64(a);
-+ b64 = _Ui64(b);
-+ if (a64 > b64) {
-+ res.m64_u64[0] = a64 - b64;
-+ } else {
-+ res.m64_u64[0] = 0;
-+ }
-+ return res;
-+}
-+
-+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-+#define vqsubq_s8 _mm_subs_epi8
-+
-+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-+#define vqsubq_s16 _mm_subs_epi16
-+
-+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
-+{
-+ //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the difference has the opposite sign to a
-+ __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
-+ c7fffffff = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_sub_epi32(a, b);
-+ res_sat = _mm_srli_epi32(a, 31);
-+ res_sat = _mm_add_epi32(res_sat, c7fffffff);
-+ res_xor_a = _mm_xor_si128(res, a);
-+ b_xor_a = _mm_xor_si128(b, a);
-+ res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
-+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
-+ res_sat = _mm_and_si128(res_xor_a, res_sat);
-+ res = _mm_andnot_si128(res_xor_a, res);
-+ return _mm_or_si128(res, res_sat);
-+}
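-+
-+/* Illustrative scalar model of the saturation rule above (an editorial sketch with a
-+hypothetical name, not part of the original patch): the wide difference is clamped to
-+the int32 range, which is what the mask/andnot selection emulates per lane. */
-+_NEON2SSE_INLINE int32_t vqsubq_s32_scalar_sketch(int32_t a, int32_t b)
-+{
-+ int64_t d = (int64_t)a - (int64_t)b; //exact difference, no wraparound
-+ if (d > (int64_t)0x7fffffff) d = (int64_t)0x7fffffff; //clamp to INT32_MAX
-+ if (d < -(int64_t)0x7fffffff - 1) d = -(int64_t)0x7fffffff - 1; //clamp to INT32_MIN
-+ return (int32_t)d;
-+}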
-+
-+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = atmp[0] - btmp[0];
-+ res[1] = atmp[1] - btmp[1];
-+ if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
-+ res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
-+ }
-+ if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
-+ res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-+#define vqsubq_u8 _mm_subs_epu8
-+
-+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
-+#define vqsubq_u16 _mm_subs_epu16
-+
-+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
-+{
-+ __m128i min, mask, sub;
-+ min = _MM_MIN_EPU32(a, b); //SSE4.1
-+ mask = _mm_cmpeq_epi32 (min, b);
-+ sub = _mm_sub_epi32 (a, b);
-+ return _mm_and_si128 ( sub, mask);
-+}
-+
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
-+ {
-+ __m128i c80000000, subb, suba, cmp, sub;
-+ c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-+ sub = _mm_sub_epi64 (a, b);
-+ suba = _mm_sub_epi64 (a, c80000000);
-+ subb = _mm_sub_epi64 (b, c80000000);
-+ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
-+ return _mm_and_si128 (sub, cmp); //saturation
-+ }
-+#else
-+ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+ {
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ _mm_store_si128((__m128i*)btmp, b);
-+ res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
-+ res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
-+ return _mm_load_si128((__m128i*)(res));
-+ }
-+#endif
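-+
-+/* Illustrative scalar model (editorial sketch with a hypothetical name, not part of the
-+original patch) of the unsigned saturating subtract implemented above: keep a - b where
-+a > b, otherwise saturate to zero, which is what the compare mask selects per lane. */
-+_NEON2SSE_INLINE uint64_t vqsubq_u64_scalar_sketch(uint64_t a, uint64_t b)
-+{
-+ return (a > b) ? (a - b) : 0; //difference where it does not underflow, 0 otherwise
-+}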
-+
-+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
-+//****************************************************************
-+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
-+{
-+ //no 8-bit shift available and internal overflow is possible, so widen to 16 bit
-+ int8x8_t res64;
-+ __m128i r16;
-+ int8x8_t r;
-+ r = vsub_s8 (a, b);
-+ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
-+ r16 = _mm_srai_epi16 (r16, 1); //SSE2
-+ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
-+ return64(r16);
-+}
-+
-+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+
-+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i c128, au,bu;
-+ c128 = _mm_set1_epi8 (128);
-+ au = _mm_add_epi8( a, c128);
-+ bu = _mm_add_epi8( b, c128);
-+ return vhsubq_u8(au,bu);
-+}
-+
-+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i c8000, au,bu;
-+ c8000 = _mm_set1_epi16(0x8000);
-+ au = _mm_add_epi16( a, c8000);
-+ bu = _mm_add_epi16( b, c8000);
-+ return vhsubq_u16(au,bu);
-+}
-+
-+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i a2, b2,r, b_1;
-+ a2 = _mm_srai_epi32 (a,1);
-+ b2 = _mm_srai_epi32 (b,1);
-+ r = _mm_sub_epi32 (a2, b2);
-+ b_1 = _mm_andnot_si128(a, b); //!a and b
-+ b_1 = _mm_slli_epi32 (b_1,31);
-+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
-+ return _mm_sub_epi32(r,b_1);
-+}
-+
-+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
-+{
-+ __m128i avg;
-+ avg = _mm_avg_epu8 (a, b);
-+ return _mm_sub_epi8(a, avg);
-+}
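-+
-+//why the average trick above works (editorial sketch, not in the original patch):
-+//_mm_avg_epu8 computes (a + b + 1) >> 1 exactly per lane, and for any integers
-+//floor((a - b) / 2) == a - ((a + b + 1) >> 1), so the halving subtract needs no
-+//widening; the 16-bit form below uses the same identity with _mm_avg_epu16.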
-+
-+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
-+{
-+ __m128i avg;
-+ avg = _mm_avg_epu16 (a, b);
-+ return _mm_sub_epi16(a, avg);
-+}
-+
-+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
-+{
-+ //need to deal with the possibility of internal overflow
-+ __m128i a2, b2,r, b_1;
-+ a2 = _mm_srli_epi32 (a,1);
-+ b2 = _mm_srli_epi32 (b,1);
-+ r = _mm_sub_epi32 (a2, b2);
-+ b_1 = _mm_andnot_si128(a, b); //!a and b
-+ b_1 = _mm_slli_epi32 (b_1,31);
-+ b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
-+ return _mm_sub_epi32(r,b_1);
-+}
-+
-+//******* Vector subtract high half (truncated) ** ************
-+//************************************************************
-+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sum, sum8;
-+ sum = _mm_sub_epi16 (a, b);
-+ sum8 = _mm_srai_epi16 (sum, 8);
-+ sum8 = _mm_packs_epi16(sum8,sum8);
-+ return64(sum8);
-+}
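-+
-+//editorial sketch (not in the original patch), per-lane meaning of the narrowing forms
-+//in this section for the 16-bit case: r[i] = (int8_t)((a[i] - b[i]) >> 8), i.e. the
-+//truncated high half of the difference; the wider cases shift by 16 and 32 respectively.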
-+
-+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
-+{
-+ int16x4_t res64;
-+ __m128i sum, sum16;
-+ sum = _mm_sub_epi32 (a, b);
-+ sum16 = _mm_srai_epi32 (sum, 16);
-+ sum16 = _mm_packs_epi32(sum16,sum16);
-+ return64(sum16);
-+}
-+
-+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i sub;
-+ sub = _mm_sub_epi64 (a, b);
-+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sub);
-+}
-+
-+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sum, sum8;
-+ sum = _mm_sub_epi16 (a, b);
-+ sum8 = _mm_srli_epi16 (sum, 8);
-+ sum8 = _mm_packus_epi16(sum8,sum8);
-+ return64(sum8);
-+}
-+
-+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
-+{
-+ uint16x4_t res64;
-+ __m128i sum, sum16;
-+ sum = _mm_sub_epi32 (a, b);
-+ sum16 = _mm_srli_epi32 (sum, 16);
-+ sum16 = _MM_PACKUS1_EPI32(sum16);
-+ return64(sum16);
-+}
-+
-+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-+#define vsubhn_u64 vsubhn_s64
-+
-+//************ Vector rounding subtract high half *********************
-+//*********************************************************************
-+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
-+{
-+ int8x8_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sub = _mm_srai_epi16 (sub, 8); //get high half
-+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
-+ sub = _mm_packs_epi16 (sub, sub);
-+ return64(sub);
-+}
-+
-+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
-+{
-+ //SIMD may not be optimal; serial may be faster
-+ int16x4_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sub = _mm_srai_epi32 (sub, 16); //get high half
-+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
-+ sub = _mm_packs_epi32 (sub, sub);
-+ return64(sub);
-+}
-+
-+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
-+{
-+ //SIMD may not be optimal; serial may be faster
-+ int32x2_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi64 (a, b);
-+ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
-+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
-+ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
-+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
-+ return64(sub);
-+}
-+
-+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
-+{
-+ uint8x8_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi16 (a, b);
-+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
-+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
-+ sub = _mm_srai_epi16 (sub, 8); //get high half
-+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
-+ sub = _mm_packus_epi16 (sub, sub);
-+ return64(sub);
-+}
-+
-+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
-+{
-+ //SIMD may not be optimal; serial may be faster
-+ uint16x4_t res64;
-+ __m128i sub, mask1;
-+ sub = _mm_sub_epi32 (a, b);
-+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
-+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
-+ sub = _mm_srai_epi32 (sub, 16); //get high half
-+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
-+ sub = _MM_PACKUS1_EPI32 (sub);
-+ return64(sub);
-+}
-+
-+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-+#define vrsubhn_u64 vrsubhn_s64
-+
-+//*********** Vector saturating doubling multiply subtract long ********************
-+//************************************************************************************
-+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ //not an optimal SIMD solution; serial may be faster
-+ __m128i res32, mask;
-+ int32x4_t res;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ res = vmull_s16(b, c);
-+ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
-+ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
-+ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
-+ return vqsubq_s32(a, res32); //saturation
-+}
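-+
-+//editorial sketch (not in the original patch): doubling the 32-bit product overflows only
-+//for INT16_MIN * INT16_MIN = 0x40000000, whose double is 0x80000000; the cmpeq/xor pair
-+//above flips exactly that value to 0x7fffffff, and vqsubq_s32 handles the remaining
-+//saturation of the accumulation.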
-+
-+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res64, mask;
-+ int64x2_t res;
-+ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
-+ res = vmull_s32(b, c);
-+ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
-+ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
-+ res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
-+ return vqsubq_s64(a, res64); //saturation
-+}
-+
-+//****************** COMPARISON ***************************************
-+//******************* Vector compare equal *************************************
-+//****************************************************************************
-+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
-+ return64f(res);
-+}
-+
-+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-+#define vceq_p8 vceq_u8
-+
-+
-+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_s8 _mm_cmpeq_epi8
-+
-+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-+#define vceqq_s16 _mm_cmpeq_epi16
-+
-+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-+#define vceqq_s32 _mm_cmpeq_epi32
-+
-+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpeq_ps(a,b);
-+ return _M128i(res);
-+}
-+
-+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_u8 _mm_cmpeq_epi8
-+
-+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-+#define vceqq_u16 _mm_cmpeq_epi16
-+
-+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-+#define vceqq_u32 _mm_cmpeq_epi32
-+
-+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-+#define vceqq_p8 _mm_cmpeq_epi8
-+
-+//******************Vector compare greater-than or equal*************************
-+//*******************************************************************************
-+//IA32 SIMD has no greater-than-or-equal comparison for integers,
-+//only greater-than, so the following tricks are needed
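-+//editorial sketch (not in the original patch): the unsigned variants below bias both
-+//operands by the sign bit, because for 8-bit lanes
-+// (uint8_t)a >= (uint8_t)b <=> (int8_t)(a - 0x80) >= (int8_t)(b - 0x80)
-+//so the signed compare instructions produce the unsigned ordering; the SSE4.1 paths
-+//use max/min plus compare-equal instead.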
-+
-+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
-+ return64f(res);
-+}
-+
-+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ //serial solution looks faster
-+ uint32x2_t res64;
-+ return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+
-+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi8 ( a, b);
-+ m2 = _mm_cmpeq_epi8 ( a, b);
-+ return _mm_or_si128 ( m1, m2);
-+}
-+
-+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi16 ( a, b);
-+ m2 = _mm_cmpeq_epi16 ( a, b);
-+ return _mm_or_si128 ( m1,m2);
-+}
-+
-+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-+{
-+ __m128i m1, m2;
-+ m1 = _mm_cmpgt_epi32 (a, b);
-+ m2 = _mm_cmpeq_epi32 (a, b);
-+ return _mm_or_si128 (m1, m2);
-+}
-+
-+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpge_ps(a,b); //all 4 entries are compared
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-+{
-+ //no unsigned char comparison, only signed is available, so a trick is needed
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu8(a, b);
-+ return _mm_cmpeq_epi8(cmp, a); //a>=b
-+ #else
-+ __m128i c128, as, bs, m1, m2;
-+ c128 = _mm_set1_epi8 (128);
-+ as = _mm_sub_epi8( a, c128);
-+ bs = _mm_sub_epi8( b, c128);
-+ m1 = _mm_cmpgt_epi8( as, bs);
-+ m2 = _mm_cmpeq_epi8 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-+{
-+ //no unsigned short comparison, only signed is available, so a trick is needed
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu16(a, b);
-+ return _mm_cmpeq_epi16(cmp, a); //a>=b
-+ #else
-+ __m128i c8000, as, bs, m1, m2;
-+ c8000 = _mm_set1_epi16 (0x8000);
-+ as = _mm_sub_epi16(a,c8000);
-+ bs = _mm_sub_epi16(b,c8000);
-+ m1 = _mm_cmpgt_epi16(as, bs);
-+ m2 = _mm_cmpeq_epi16 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-+{
-+ //no unsigned int comparison, only signed is available, so a trick is needed
-+ #ifdef USE_SSE4
-+ __m128i cmp;
-+ cmp = _mm_max_epu32(a, b);
-+ return _mm_cmpeq_epi32(cmp, a); //a>=b
-+ #else
-+ //serial solution may be faster
-+ __m128i c80000000, as, bs, m1, m2;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ as = _mm_sub_epi32(a,c80000000);
-+ bs = _mm_sub_epi32(b,c80000000);
-+ m1 = _mm_cmpgt_epi32 (as, bs);
-+ m2 = _mm_cmpeq_epi32 (as, bs);
-+ return _mm_or_si128 ( m1, m2);
-+ #endif
-+}
-+
-+//**********************Vector compare less-than or equal******************************
-+//***************************************************************************************
-+//IA32 SIMD has no less-than-or-equal comparison for integers, so the same kind of tricks are needed
-+
-+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vcleq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vcleq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vcleq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
-+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmple_ps(_pM128(a),_pM128(b));
-+ return64f(res);
-+}
-+
-+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-+#define vcle_u8(a,b) vcge_u8(b,a)
-+
-+
-+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
-+#define vcle_u16(a,b) vcge_u16(b,a)
-+
-+
-+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-+#define vcle_u32(a,b) vcge_u32(b,a)
-+
-+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi8 ( a, b);
-+ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
-+}
-+
-+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi16 ( a, b);
-+ return _mm_andnot_si128 (res, c1);
-+}
-+
-+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-+{
-+ __m128i c1, res;
-+ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
-+ res = _mm_cmpgt_epi32 ( a, b);
-+ return _mm_andnot_si128 (res, c1);
-+}
-+
-+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmple_ps(a,b);
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-+ {
-+ //no unsigned char comparison in SSE, only signed is available, so a trick is needed
-+ __m128i cmp;
-+ cmp = _mm_min_epu8(a, b);
-+ return _mm_cmpeq_epi8(cmp, a); //a<=b
-+ }
-+#else
-+ #define vcleq_u8(a,b) vcgeq_u8(b,a)
-+#endif
-+
-+
-+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-+ {
-+ //no unsigned short comparison in SSE, only signed is available, so a trick is needed
-+ __m128i cmp;
-+ cmp = _mm_min_epu16(a, b);
-+ return _mm_cmpeq_epi16(cmp, a); //a<=b
-+ }
-+#else
-+ #define vcleq_u16(a,b) vcgeq_u16(b,a)
-+#endif
-+
-+
-+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-+#ifdef USE_SSE4
-+ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-+ {
-+ //no unsigned int comparison in SSE, only signed is available, so a trick is needed
-+ __m128i cmp;
-+ cmp = _mm_min_epu32(a, b);
-+ return _mm_cmpeq_epi32(cmp, a); //a<=b
-+ }
-+#else
-+//this solution may not be optimal compared with the serial one
-+ #define vcleq_u32(a,b) vcgeq_u32(b,a)
-+#endif
-+
-+
-+//****** Vector compare greater-than ******************************************
-+//**************************************************************************
-+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128 res;
-+ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
-+ return64f(res);
-+}
-+
-+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+#define vcgtq_s8 _mm_cmpgt_epi8
-+
-+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+#define vcgtq_s16 _mm_cmpgt_epi16
-+
-+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+#define vcgtq_s32 _mm_cmpgt_epi32
-+
-+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
-+{
-+ __m128 res;
-+ res = _mm_cmpgt_ps(a,b); //all 4 entries are compared
-+ return *(__m128i*)&res;
-+}
-+
-+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
-+{
-+ //no unsigned char comparison, only signed is available, so a trick is needed
-+ __m128i c128, as, bs;
-+ c128 = _mm_set1_epi8 (128);
-+ as = _mm_sub_epi8(a,c128);
-+ bs = _mm_sub_epi8(b,c128);
-+ return _mm_cmpgt_epi8 (as, bs);
-+}
-+
-+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
-+{
-+ //no unsigned short comparison, only signed is available, so a trick is needed
-+ __m128i c8000, as, bs;
-+ c8000 = _mm_set1_epi16 (0x8000);
-+ as = _mm_sub_epi16(a,c8000);
-+ bs = _mm_sub_epi16(b,c8000);
-+ return _mm_cmpgt_epi16 ( as, bs);
-+}
-+
-+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
-+{
-+ //no unsigned int comparison, only signed is available, so a trick is needed
-+ __m128i c80000000, as, bs;
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+ as = _mm_sub_epi32(a,c80000000);
-+ bs = _mm_sub_epi32(b,c80000000);
-+ return _mm_cmpgt_epi32 ( as, bs);
-+}
-+
-+//********************* Vector compare less-than **************************
-+//*************************************************************************
-+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
-+
-+
-+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
-+
-+
-+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-+#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
-+
-+
-+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
-+
-+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
-+
-+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
-+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
-+
-+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
-+
-+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
-+
-+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
-+
-+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
-+
-+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
-+
-+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
-+
-+//*****************Vector compare absolute greater-than or equal ************
-+//***************************************************************************
-+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpge_ps ( a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpge_ps ( a0, b0);
-+ return (*(__m128i*)&a0);
-+}
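-+
-+//editorial sketch (not in the original patch): clearing the sign bit with the 0x7fffffff
-+//mask yields |a| and |b| for finite floats, so an ordinary _mm_cmpge_ps on the masked
-+//values implements the absolute comparison; the vcale/vcagt/vcalt forms below only
-+//change the final compare.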
-+
-+//********Vector compare absolute less-than or equal ******************
-+//********************************************************************
-+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmple_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmple_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//******** Vector compare absolute greater-than ******************
-+//******************************************************************
-+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpgt_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmpgt_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//***************Vector compare absolute less-than ***********************
-+//*************************************************************************
-+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-+ a0 = _mm_cmplt_ps (a0, b0);
-+ return64f(a0);
-+}
-+
-+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-+{
-+ __m128i c7fffffff;
-+ __m128 a0, b0;
-+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
-+ a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-+ b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-+ a0 = _mm_cmplt_ps (a0, b0);
-+ return (*(__m128i*)&a0);
-+}
-+
-+//*************************Vector test bits************************************
-+//*****************************************************************************
-+/*VTST (Vector Test Bits) takes each element of a vector and bitwise ANDs it
-+with the corresponding element of a second vector. If the result is not zero, the
-+corresponding element in the destination vector is set to all ones. Otherwise, it is
-+set to all zeros. */
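-+//per-lane scalar model for an 8-bit element (editorial sketch, not in the original patch):
-+// result = ((a & b) != 0) ? 0xFF : 0x00;
-+//the implementations below compute this as NOT(cmpeq(a & b, 0)).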
-+
-+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
-+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vtstq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
-+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vtstq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
-+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vtstq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
-+#define vtst_u8 vtst_s8
-+
-+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
-+#define vtst_u16 vtst_s16
-+
-+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
-+#define vtst_u32 vtst_s32
-+
-+
-+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-+#define vtst_p8 vtst_u8
-+
-+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi8 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi16 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
-+{
-+ __m128i zero, one, res;
-+ zero = _mm_setzero_si128 ();
-+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-+ res = _mm_and_si128 (a, b);
-+ res = _mm_cmpeq_epi32 (res, zero);
-+ return _mm_xor_si128(res, one); //invert result
-+}
-+
-+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-+#define vtstq_u8 vtstq_s8
-+
-+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-+#define vtstq_u16 vtstq_s16
-+
-+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-+#define vtstq_u32 vtstq_s32
-+
-+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
-+#define vtstq_p8 vtstq_u8
-+
-+//****************** Absolute difference ********************
-+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
-+//************************************************************
-+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vabdq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vabdq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vabdq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(vabdq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vabdq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vabdq_u32(_pM128i(a), _pM128i(b)));
-+}
-+
-+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = vabdq_f32(_pM128(a), _pM128(b));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi8 (a, b);
-+ return _mm_abs_epi8 (res);
-+}
-+
-+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi16 (a,b);
-+ return _mm_abs_epi16 (res);
-+}
-+
-+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
-+{
-+ __m128i res;
-+ res = _mm_sub_epi32 (a,b);
-+ return _mm_abs_epi32 (res);
-+}
-+
-+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u8(a,b);
-+ difab = _mm_sub_epi8(a,b);
-+ difba = _mm_sub_epi8 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
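-+
-+/* Illustrative scalar model (editorial sketch with a hypothetical name, not part of the
-+original patch) of the unsigned absolute-difference selection above: compute both
-+differences and keep the one picked by the comparison mask. */
-+_NEON2SSE_INLINE uint8_t vabdq_u8_scalar_sketch(uint8_t a, uint8_t b)
-+{
-+ return (uint8_t)((a > b) ? (a - b) : (b - a)); //|a - b| without needing signed abs
-+}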
-+
-+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u16(a,b);
-+ difab = _mm_sub_epi16(a,b);
-+ difba = _mm_sub_epi16 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
-+
-+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
-+{
-+ __m128i cmp, difab, difba;
-+ cmp = vcgtq_u32(a,b);
-+ difab = _mm_sub_epi32(a,b);
-+ difba = _mm_sub_epi32 (b,a);
-+ difab = _mm_and_si128(cmp, difab);
-+ difba = _mm_andnot_si128(cmp, difba);
-+ return _mm_or_si128(difab, difba);
-+}
-+
-+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
-+{
-+ __m128i c1;
-+ __m128 res;
-+ c1 = _mm_set1_epi32(0x7fffffff);
-+ res = _mm_sub_ps (a, b);
-+ return _mm_and_ps (res, *(__m128*)&c1);
-+}
-+
-+//************ Absolute difference - long **************************
-+//********************************************************************
-+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
-+{
-+ __m128i a16, b16;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ return vabdq_s16(a16, b16);
-+
-+}
-+
-+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
-+{
-+ __m128i a32, b32;
-+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-+ return vabdq_s32(a32, b32);
-+}
-+
-+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no optimal SIMD solution, serial looks faster
-+ _NEON2SSE_ALIGN_16 int64_t res[2];
-+ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
-+ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
-+ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
-+ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ __m128i res;
-+ res = vsubl_u8(a,b);
-+ return _mm_abs_epi16(res);
-+}
-+
-+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ __m128i res;
-+ res = vsubl_u16(a,b);
-+ return _mm_abs_epi32(res);
-+}
-+
-+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
-+ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
-+ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
-+ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
-+//*********************************************************************************************
-+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t res64;
-+ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
-+}
-+
-+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
-+{
-+ int16x4_t res64;
-+ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
-+{
-+ int32x2_t res64;
-+ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-+#define vaba_u8 vaba_s8
-+
-+
-+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
-+#define vaba_u16 vaba_s16
-+
-+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
-+{
-+ uint32x2_t res64;
-+ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
-+}
-+
-+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
-+{
-+ int8x16_t sub;
-+ sub = vabdq_s8(b, c);
-+ return vaddq_s8( a, sub);
-+}
-+
-+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
-+{
-+ int16x8_t sub;
-+ sub = vabdq_s16(b, c);
-+ return vaddq_s16( a, sub);
-+}
-+
-+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
-+{
-+ int32x4_t sub;
-+ sub = vabdq_s32(b, c);
-+ return vaddq_s32( a, sub);
-+}
-+
-+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
-+{
-+ uint8x16_t sub;
-+ sub = vabdq_u8(b, c);
-+ return vaddq_u8( a, sub);
-+}
-+
-+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
-+{
-+ uint16x8_t sub;
-+ sub = vabdq_u16(b, c);
-+ return vaddq_u16( a, sub);
-+}
-+
-+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
-+{
-+ uint32x4_t sub;
-+ sub = vabdq_u32(b, c);
-+ return vaddq_u32( a, sub);
-+}
-+
-+//************** Absolute difference and accumulate - long ********************************
-+//*************************************************************************************
-+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
-+{
-+ __m128i b16, c16, res;
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-+ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
-+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-+ return _mm_add_epi16 (a, res);
-+}
-+
-+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
-+{
-+ __m128i b32, c32, res;
-+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
-+ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
-+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-+ return _mm_add_epi32 (a, res);
-+}
-+
-+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res;
-+ res = vabdl_s32(b,c);
-+ return _mm_add_epi64(a, res);
-+}
-+
-+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ __m128i b16, c16, res;
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-+ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
-+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-+ return _mm_add_epi16 (a, res);
-+}
-+
-+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
-+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
-+{
-+ __m128i b32, c32, res;
-+ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
-+ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
-+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-+ return _mm_add_epi32 (a, res);
-+}
-+
-+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ __m128i res;
-+ res = vabdl_u32(b,c);
-+ return _mm_add_epi64(a, res);
-+}
-+
-+//***********************************************************************************
-+//**************** Maximum and minimum operations **********************************
-+//***********************************************************************************
-+//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
-+//***********************************************************************************
-+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be efficient compared with serial
-+ return64(res);
-+}
-+
-+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-+ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-+ return res;
-+}
-+
-+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
-+
-+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-+#define vmaxq_s16 _mm_max_epi16
-+
-+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
-+
-+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-+#define vmaxq_u8 _mm_max_epu8
-+
-+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
-+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
-+
-+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
-+
-+
-+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-+#define vmaxq_f32 _mm_max_ps
-+
-+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
-+//***********************************************************************************************************
-+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-+ return64(res);
-+}
-+
-+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
-+{
-+ uint16x4_t res64;
-+ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
-+{
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as effective as a serial implementation
-+ return64(res);
-+}
-+
-+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-+ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-+ return res;
-+}
-+
-+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
-+
-+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-+#define vminq_s16 _mm_min_epi16
-+
-+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
-+
-+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-+#define vminq_u8 _mm_min_epu8
-+
-+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
-+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
-+
-+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
-+
-+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
-+#define vminq_f32 _mm_min_ps
-+
-+//************* Pairwise addition operations. **************************************
-+//************************************************************************************
-+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
-+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit and then pack
-+ int8x8_t res64;
-+ __m128i a16, b16, res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-+ res = _mm_hadd_epi16 (a16, b16);
-+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
-+ return64(res);
-+}
-+
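A scalar reference of the pairwise-add semantics is handy when following the hadd/shuffle emulation above; this is an illustrative sketch (the _ref helper is not part of the header):

#include <stdint.h>
#include <stdio.h>

/* result = { a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7 },
   with ordinary 8-bit wrap-around on overflow. */
static void vpadd_s8_ref(const int8_t a[8], const int8_t b[8], int8_t r[8])
{
    for (int i = 0; i < 4; i++) {
        r[i]     = (int8_t)(a[2 * i] + a[2 * i + 1]);
        r[i + 4] = (int8_t)(b[2 * i] + b[2 * i + 1]);
    }
}

int main(void)
{
    int8_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int8_t b[8] = { 10, 20, 30, 40, -5, 5, -6, 6 };
    int8_t r[8];
    vpadd_s8_ref(a, b, r);
    for (int i = 0; i < 8; i++) printf("%d ", r[i]);   /* 3 7 11 15 30 70 0 0 */
    printf("\n");
    return 0;
}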
-+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ __m128i hadd128;
-+ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
-+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(hadd128);
-+}
-+
-+
-+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ __m128i hadd128;
-+ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
-+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(hadd128);
-+}
-+
-+
-+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
-+{
-+ // no 8 bit hadd in IA32, need to go to 16 bit and then pack
-+ uint8x8_t res64;
-+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned fits in 16-bit signed, so it should work
-+ __m128i mask8, a16, b16, res;
-+ mask8 = _mm_set1_epi16(0xff);
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-+ res = _mm_hadd_epi16 (a16, b16);
-+ res = _mm_and_si128(res, mask8); //to avoid saturation
-+ res = _mm_packus_epi16 (res,res); //use low 64 bits
-+ return64(res);
-+}
-+
-+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
-+{
-+ // this solution may not be optimal; serial execution may be faster
-+ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
-+ uint16x4_t res64;
-+ __m128i c32767, cfffe, as, bs, res;
-+ c32767 = _mm_set1_epi16 (32767);
-+ cfffe = _mm_set1_epi16 (0xfffe);
-+ as = _mm_sub_epi16 (_pM128i(a), c32767);
-+ bs = _mm_sub_epi16 (_pM128i(b), c32767);
-+ res = _mm_hadd_epi16 (as, bs);
-+ res = _mm_add_epi16 (res, cfffe);
-+ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(res);
-+}
-+
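The bias trick in vpadd_u16 works because 16-bit addition wraps: (a0 - 32767) + (a1 - 32767) = a0 + a1 - 65534, and adding 0xfffe (65534) back modulo 2^16 recovers a0 + a1. A small scalar check of that identity for one adjacent pair of lanes (illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t a0 = 60000, a1 = 50000;          /* an adjacent pair of unsigned lanes */
    int16_t  b0 = (int16_t)(a0 - 32767u);     /* biased into the signed range */
    int16_t  b1 = (int16_t)(a1 - 32767u);
    uint16_t biased = (uint16_t)((uint16_t)(b0 + b1) + 0xfffeu);  /* signed hadd + correction */
    uint16_t plain  = (uint16_t)(a0 + a1);    /* 110000 mod 65536 = 44464 */
    printf("%d %d\n", biased, plain);
    assert(biased == plain);
    return 0;
}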
-+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
-+{
-+ //hadd doesn't work for unsigned values
-+ uint32x2_t res64;
-+ __m128i ab, ab_sh, res;
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
-+ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
-+ res = _mm_add_epi32(ab, ab_sh);
-+ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ return64(res);
-+}
-+
-+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
-+{
-+ __m128 hadd128;
-+ __m64_128 res64;
-+ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
-+ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
-+ _M64f(res64, hadd128);
-+ return res64;
-+}
-+
-+
-+//************************** Long pairwise add **********************************
-+//*********************************************************************************
-+//Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
-+// and places the final results in the destination vector.
-+
-+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit anyway
-+ __m128i a16;
-+ int16x4_t res64;
-+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
-+ return64(a16);
-+}
-+
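In contrast to the plain pairwise add shown earlier, the long form widens each pair to twice the width before summing, so the sums cannot wrap. A scalar reference (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* result[i] = a[2i] + a[2i+1], computed in int16 so e.g. 100 + 100 stays 200. */
static void vpaddl_s8_ref(const int8_t a[8], int16_t r[4])
{
    for (int i = 0; i < 4; i++)
        r[i] = (int16_t)((int16_t)a[2 * i] + (int16_t)a[2 * i + 1]);
}

int main(void)
{
    int8_t a[8] = { 100, 100, -128, -128, 1, -2, 127, 1 };
    int16_t r[4];
    vpaddl_s8_ref(a, r);
    for (int i = 0; i < 4; i++) printf("%d ", r[i]);   /* 200 -256 -1 128 */
    printf("\n");
    return 0;
}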
-+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
-+{
-+ // this solution may not be optimal; serial execution may be faster
-+ int32x2_t res64;
-+ __m128i r32_1;
-+ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
-+ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
-+ return64(r32_1);
-+}
-+
-+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
-+ return res;
-+}
-+
-+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
-+{
-+ // no 8 bit hadd in IA32, need to go to 16 bit
-+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned fits in 16-bit signed, so it should work
-+ uint16x4_t res64;
-+ __m128i a16;
-+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
-+ return64(a16);
-+}
-+
-+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than a SIMD one
-+ uint32x2_t res;
-+ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
-+ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
-+ return res;
-+}
-+
-+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
-+ return res;
-+}
-+
-+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit
-+ __m128i r16_1, r16_2;
-+ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-+ //swap hi and low part of r to process the remaining data
-+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
-+ return _mm_hadd_epi16 (r16_1, r16_2);
-+}
-+
-+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
-+{
-+ //no widening hadd in IA32, need to convert to 32 bit first
-+ __m128i r32_1, r32_2;
-+ r32_1 = _MM_CVTEPI16_EPI32(a);
-+ //swap hi and low part of r to process the remaining data
-+ r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
-+ return _mm_hadd_epi32 (r32_1, r32_2);
-+}
-+
-+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 int64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
-+ res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
-+{
-+ //no 8 bit hadd in IA32, need to go to 16 bit
-+ __m128i r16_1, r16_2;
-+ r16_1 = _MM_CVTEPU8_EPI16(a);
-+ //swap hi and low part of r to process the remaining data
-+ r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
-+ return _mm_hadd_epi16 (r16_1, r16_2);
-+}
-+
-+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than a SIMD one
-+ _NEON2SSE_ALIGN_16 uint16_t atmp[8];
-+ _NEON2SSE_ALIGN_16 uint32_t res[4];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
-+ res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
-+ res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
-+ res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
-+ res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//************************ Long pairwise add and accumulate **************************
-+//****************************************************************************************
-+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
-+// and accumulates the values of the results into the elements of the destination (wide) vector
-+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
-+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
-+{
-+ int16x4_t res64;
-+ return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
-+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
-+{
-+ int32x2_t res64;
-+ return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
-+ return res;
-+}
-+
-+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
-+{
-+ uint16x4_t res64;
-+ return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
-+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
-+{
-+ uint32x2_t res64;
-+ return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
-+}
-+
-+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
-+ return res;
-+}
-+
-+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
-+{
-+ int16x8_t pad;
-+ pad = vpaddlq_s8(b);
-+ return _mm_add_epi16 (a, pad);
-+}
-+
-+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
-+{
-+ int32x4_t pad;
-+ pad = vpaddlq_s16(b);
-+ return _mm_add_epi32(a, pad);
-+}
-+
-+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
-+{
-+ int64x2_t pad;
-+ pad = vpaddlq_s32(b);
-+ return _mm_add_epi64 (a, pad);
-+}
-+
-+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
-+{
-+ uint16x8_t pad;
-+ pad = vpaddlq_u8(b);
-+ return _mm_add_epi16 (a, pad);
-+}
-+
-+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x4_t pad;
-+ pad = vpaddlq_u16(b);
-+ return _mm_add_epi32(a, pad);
-+} //no optimal SIMD solution, serial is faster
-+
-+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no optimal SIMD solution, serial is faster
-+ uint64x2_t pad;
-+ pad = vpaddlq_u32(b);
-+ return _mm_add_epi64(a, pad);
-+} //no optimal SIMD solution, serial is faster
-+
-+//********** Folding maximum *************************************
-+//*******************************************************************
-+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
-+//and copies the larger of each pair into the corresponding element in the destination
-+// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
-+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
-+{
-+ int8x8_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-+ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(max); //we need 64 bits only
-+}
-+
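A scalar reference helps when following the swap/compare/compress sequence above: the low half of the result is the pairwise maximum of a, the high half the pairwise maximum of b (illustrative sketch, not part of the header):

#include <stdint.h>
#include <stdio.h>

static void vpmax_s8_ref(const int8_t a[8], const int8_t b[8], int8_t r[8])
{
    for (int i = 0; i < 4; i++) {
        r[i]     = a[2 * i] > a[2 * i + 1] ? a[2 * i] : a[2 * i + 1];
        r[i + 4] = b[2 * i] > b[2 * i + 1] ? b[2 * i] : b[2 * i + 1];
    }
}

int main(void)
{
    /* In the SSE version, ab = {a|b}; the mask8_sab shuffle swaps each even
       lane with its odd neighbour, so a vertical max of ab and the swapped
       copy holds max(a0,a1) in lanes 0 and 1, max(a2,a3) in lanes 2 and 3,
       and so on; keeping only the odd lanes (mask8_odd) drops the duplicates. */
    int8_t a[8] = { 1, 9, -3, 4, 7, 2, 0, 5 };
    int8_t b[8] = { 6, 6, -1, -8, 3, 11, 2, 2 };
    int8_t r[8];
    vpmax_s8_ref(a, b, r);
    for (int i = 0; i < 8; i++) printf("%d ", r[i]);   /* 9 4 7 5 6 -1 11 2 */
    printf("\n");
    return 0;
}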
-+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ int16x4_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-+ max = _mm_max_epi16 (ab, ab1);
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(max);
-+}
-+
-+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ int32x2_t res;
-+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
-+{
-+ uint8x8_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-+ max = _mm_max_epu8 (ab, ab1); // SSE4.1
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(max);
-+}
-+
-+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ uint16x4_t res64;
-+ __m128i ab, ab1, max;
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-+ max = _MM_MAX_EPU16 (ab, ab1);
-+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(max);
-+}
-+
-+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ uint32x2_t res;
-+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-+ return res;
-+} //serial solution looks faster than a SIMD one
-+
-+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-+ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-+ return res;
-+}
-+
-+// ***************** Folding minimum ****************************
-+// **************************************************************
-+//vpmin -> takes minimum of adjacent pairs
-+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
-+{
-+ int8x8_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-+ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(min);
-+}
-+
-+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ int16x4_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-+ min = _mm_min_epi16 (ab, ab1);
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(min);
-+}
-+
-+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ int32x2_t res;
-+ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-+ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
-+{
-+ uint8x8_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-+ min = _mm_min_epu8 (ab, ab1); // SSE4.1
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-+ return64(min);
-+}
-+
-+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
-+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
-+{
-+ //this solution may not be optimal compared with the serial one
-+ uint16x4_t res64;
-+ __m128i ab, ab1, min;
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
-+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-+ min = _MM_MIN_EPU16 (ab, ab1);
-+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
-+ return64(min);
-+}
-+
-+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ uint32x2_t res;
-+ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
-+ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
-+ return res;
-+}
-+
-+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution looks faster than the SIMD one
-+ float32x2_t res;
-+ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-+ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-+ return res;
-+}
-+
-+//***************************************************************
-+//*********** Reciprocal/Sqrt ************************************
-+//***************************************************************
-+//****************** Reciprocal estimate *******************************
-+//the ARM NEON and x86 SIMD results may be slightly different
-+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = _mm_rcp_ps(_pM128(a));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is a fixed-point number. No integer reciprocal is available in IA32
-+ uint32x2_t res;
-+ float resf, r;
-+ int i, q, s;
-+ for (i =0; i<2; i++){
-+ if((a.m64_u32[i] & 0x80000000) == 0) {
-+ res.m64_u32[i] = 0xffffffff;
-+ }else{
-+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
-+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
-+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res.m64_u32[i] = r * (uint32_t)(1 << 31);
-+ }
-+ }
-+ return res;
-+}
-+
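The per-lane arithmetic above mirrors the ARM recip_estimate algorithm in fixed point: the input is unsigned Q0.32 (a valid operand lies in [0.5, 1.0) and has its top bit set), the value is quantised to multiples of 1/512, the reciprocal of the interval midpoint is taken, and the result is rounded to multiples of 1/256 and scaled back by 2^31. A scalar walk-through of the same steps (illustrative only, not part of the header):

#include <stdint.h>
#include <stdio.h>

static uint32_t recip_estimate_scalar(uint32_t a)
{
    if ((a & 0x80000000u) == 0)
        return 0xffffffffu;                                  /* a < 0.5: saturate */
    float resf = (float)(a * (0.5f / (uint32_t)(1u << 31))); /* a / 2^32 */
    int   q    = (int)(resf * 512.0f);                       /* units of 1/512 */
    float r    = 1.0f / (((float)q + 0.5f) / 512.0f);        /* reciprocal of the midpoint */
    int   s    = (int)(256.0f * r + 0.5f);                   /* round to units of 1/256 */
    r = (float)s / 256.0f;
    return (uint32_t)(r * (uint32_t)(1u << 31));
}

int main(void)
{
    /* 0xC0000000 is 0.75 in Q0.32; the estimate of 1/0.75 comes out as
       0xAA800000, i.e. 1.33203125 (the exact reciprocal is 1.3333...). */
    printf("0x%08x\n", (unsigned)recip_estimate_scalar(0xC0000000u));
    return 0;
}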
-+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-+#define vrecpeq_f32 _mm_rcp_ps
-+
-+
-+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is a fixed-point number
-+ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4];
-+ _NEON2SSE_ALIGN_16 uint32_t res[4];
-+ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
-+ float resf, r;
-+ int i, q, s;
-+ __m128i res128, mask, zero;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ zero = _mm_setzero_si128();
-+ for (i =0; i<4; i++){
-+ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
-+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
-+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
-+ }
-+ res128 = _mm_load_si128((__m128i*)res);
-+ mask = _mm_and_si128(a, *(__m128i*)c80000000);
-+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
-+ return _mm_or_si128(res128, mask);
-+}
-+
-+//**********Reciprocal square root estimate ****************
-+//**********************************************************
-+//no reciprocal square root for ints is available in IA32, nor an unsigned int to float4 lanes conversion, so a serial solution looks faster
-+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
-+//the ARM NEON and x86 SIMD results may be slightly different
-+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = _mm_rsqrt_ps(_pM128(a));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is a fixed-point number
-+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
-+ uint32x2_t res;
-+ __m128 tmp;
-+ float r, resf, coeff;
-+ int i, q0, s;
-+ for (i =0; i<2; i++){
-+ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
-+ res.m64_u32[i] = 0xffffffff;
-+ }else{
-+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
-+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
-+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
-+ r = ((float)q0 + 0.5) / coeff;
-+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
-+ _mm_store_ss(&r, tmp);
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res.m64_u32[i] = r * (((uint32_t)1) << 31);
-+ }
-+ }
-+ return res;
-+}
-+
-+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-+#define vrsqrteq_f32 _mm_rsqrt_ps
-+
-+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //Input is a fixed-point number
-+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
-+ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
-+ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
-+ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
-+ __m128 tmp;
-+ __m128i res128, mask, zero;
-+ float r, resf, coeff;
-+ int i, q0, s;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ zero = _mm_setzero_si128();
-+ for (i =0; i<4; i++){
-+ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
-+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
-+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
-+ r = ((float)q0 + 0.5) / coeff;
-+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
-+ _mm_store_ss(&r, tmp);
-+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
-+ r = (float)s / 256.0;
-+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
-+ }
-+ res128 = _mm_load_si128((__m128i*)res);
-+ mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
-+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
-+ return _mm_or_si128(res128, mask);
-+}
-+//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
-+//******************************************************************************************
-+//******VRECPS (Vector Reciprocal Step) ***************************************************
-+//multiplies the elements of one vector by the corresponding elements of another vector,
-+//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
-+
-+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ res = vrecpsq_f32(_pM128(a), _pM128(b));
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
-+{
-+ __m128 f2, mul;
-+ f2 = _mm_set1_ps(2.);
-+ mul = _mm_mul_ps(a,b);
-+ return _mm_sub_ps(f2,mul);
-+}
-+
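VRECPS is the correction factor of a Newton-Raphson iteration: if x approximates 1/a, then x*(2 - a*x) is a better approximation. A sketch of the usual pairing of the estimate and the step in plain SSE, mirroring vrecpeq_f32/vrecpsq_f32 above (illustrative only):

#include <stdio.h>
#include <xmmintrin.h>   /* SSE: _mm_rcp_ps, _mm_mul_ps, _mm_sub_ps, _mm_set1_ps */

/* x1 = x0 * (2 - a*x0); _mm_rcp_ps is only ~12-bit accurate, one step gets
   close to full single precision. */
static __m128 recip_refined(__m128 a)
{
    __m128 x0   = _mm_rcp_ps(a);                                    /* estimate  */
    __m128 step = _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)); /* 2 - a*x0  */
    return _mm_mul_ps(x0, step);
}

int main(void)
{
    float in[4] = { 3.0f, 7.0f, 0.1f, 100.0f }, out[4];
    _mm_storeu_ps(out, recip_refined(_mm_loadu_ps(in)));
    for (int i = 0; i < 4; i++)
        printf("1/%g ~= %.7f\n", in[i], out[i]);
    return 0;
}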
-+//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
-+//multiplies the elements of one vector by the corresponding elements of another vector,
-+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
-+
-+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
-+ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
-+ return res;
-+}
-+
-+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
-+{
-+ __m128 f3, f05, mul;
-+ f3 = _mm_set1_ps(3.);
-+ f05 = _mm_set1_ps(0.5);
-+ mul = _mm_mul_ps(a,b);
-+ f3 = _mm_sub_ps(f3,mul);
-+ return _mm_mul_ps (f3, f05);
-+}
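VRSQRTS plays the same role for the reciprocal square root: if x approximates 1/sqrt(a), then x*(3 - a*x*x)/2 is a better approximation. A sketch in plain SSE, mirroring vrsqrteq_f32/vrsqrtsq_f32 above (illustrative only):

#include <stdio.h>
#include <xmmintrin.h>

/* x1 = x0 * (3 - a*x0*x0) / 2, with x0 from the ~12-bit _mm_rsqrt_ps. */
static __m128 rsqrt_refined(__m128 a)
{
    __m128 x0   = _mm_rsqrt_ps(a);                          /* estimate */
    __m128 axx  = _mm_mul_ps(a, _mm_mul_ps(x0, x0));
    __m128 step = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(3.0f), axx),
                             _mm_set1_ps(0.5f));            /* (3 - a*x0*x0) / 2 */
    return _mm_mul_ps(x0, step);
}

int main(void)
{
    float in[4] = { 2.0f, 9.0f, 0.25f, 1e6f }, out[4];
    _mm_storeu_ps(out, rsqrt_refined(_mm_loadu_ps(in)));
    for (int i = 0; i < 4; i++)
        printf("1/sqrt(%g) ~= %.7f\n", in[i], out[i]);
    return 0;
}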
-+//********************************************************************************************
-+//***************************** Shifts by signed variable ***********************************
-+//********************************************************************************************
-+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
-+//********************************************************************************************
-+//No such operations exist in IA32 SIMD, unfortunately; only shifts by a constant are available, so a serial solution is needed
-+//helper macro; it matches the ARM implementation for big shifts
-+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
-+ else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
-+ for (i = 0; i<LEN; i++) { \
-+ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
-+ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
-+ return res;
-+
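Both macros encode the VSHL rule per lane: a non-negative count shifts left, a negative count shifts right by its absolute value, and any count whose magnitude reaches the lane width produces 0. A standalone per-lane illustration (it relies on an arithmetic right shift of signed values, as the macros do):

#include <stdint.h>
#include <stdio.h>

static int8_t vshl_lane_s8(int8_t a, int8_t b)
{
    const int lanesize = 8;
    if (b >= lanesize || b <= -lanesize)
        return 0;                              /* |count| >= lane width -> 0 */
    return (int8_t)(b >= 0 ? a << b : a >> -b);
}

int main(void)
{
    printf("%d\n", vshl_lane_s8(3, 2));        /* 12: left shift           */
    printf("%d\n", vshl_lane_s8(-16, -2));     /* -4: right shift by 2     */
    printf("%d\n", vshl_lane_s8(3, 9));        /*  0: count >= lane width  */
    return 0;
}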
-+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(8, i, 8)
-+}
-+
-+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(16, i, 4)
-+}
-+
-+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(32, i, 2)
-+}
-+
-+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(64, i, 1)
-+}
-+
-+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(8, u, 8)
-+}
-+
-+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(16, u, 4)
-+}
-+
-+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT_64(32, u, 2)
-+}
-+
-+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we used the SERIAL_SHIFT macro we would need special processing for large numbers
-+{
-+ SERIAL_SHIFT_64(64, u, 1)
-+}
-+
-+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int8_t, int8_t, 16, 16)
-+}
-+
-+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int16_t, int16_t, 8, 8)
-+}
-+
-+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int32_t, int32_t, 4, 4)
-+}
-+
-+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(int64_t, int64_t, 2, 2)
-+}
-+
-+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
-+}
-+
-+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
-+}
-+
-+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
-+}
-+
-+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
-+}
-+
-+
-+//*********** Vector saturating shift left: (negative values shift right) **********************
-+//********************************************************************************************
-+//No such operations are available in IA32 SIMD yet; only shifts by a constant are available, so a serial solution is needed
-+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) res[i] = 0; \
-+ else{ \
-+ if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
-+ else{ \
-+ if (btmp[i]>lanesize_1) { \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ else res[i] = atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-+ TYPE lanesize = (sizeof(TYPE) << 3); \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) {res[i] = 0; \
-+ }else{ \
-+ if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
-+ else{ \
-+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
-+ else{ \
-+ limit = (TYPE) 1 << (lanesize - btmp[i]); \
-+ res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
-+ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
-+ else{ \
-+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
-+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-+ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
-+ }else{ \
-+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
-+ else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
-+ return res;
-+
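In the signed macros the saturation value is built as ((unsigned)a >> (lanesize-1)) + (1 << (lanesize-1)) - 1: for a non-negative lane this is the lane maximum, and for a negative lane the intermediate int value wraps to the lane minimum when stored back. A scalar check for 8-bit lanes (illustrative only):

#include <stdint.h>
#include <stdio.h>

static int8_t sat8(int8_t a)
{
    const int lanesize_1 = 7;
    /* 0 + 128 - 1 = 127 for a >= 0; 1 + 128 - 1 = 128, which wraps to -128, for a < 0 */
    return (int8_t)(((uint8_t)a >> lanesize_1) + (1 << lanesize_1) - 1);
}

int main(void)
{
    printf("%d %d\n", sat8(100), sat8(-5));    /* 127 -128 */
    return 0;
}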
-+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
-+}
-+
-+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
-+}
-+
-+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
-+}
-+
-+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
-+}
-+
-+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
-+}
-+
-+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
-+}
-+
-+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
-+}
-+
-+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
-+}
-+
-+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
-+}
-+
-+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
-+}
-+
-+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
-+}
-+
-+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
-+}
-+
-+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
-+}
-+
-+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
-+}
-+
-+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
-+}
-+
-+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
-+}
-+
-+
-+//******** Vector rounding shift left: (negative values shift right) **********
-+//****************************************************************************
-+//No such operations are available in IA32 SIMD yet; only shifts by a constant are available, so a serial solution is needed
-+//rounding makes sense for right shifts only.
-+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if( btmp[i] >= 0) { \
-+ if(btmp[i] >= lanesize) res[i] = 0; \
-+ else res[i] = (atmp[i] << btmp[i]); \
-+ }else{ \
-+ res[i] = (btmp[i] < -lanesize) ? 0 : \
-+ (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
-+ (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+
-+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
-+ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
-+ for (i = 0; i<LEN; i++) { \
-+ if( b.m64_i ## TYPE[i] >= 0) { \
-+ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
-+ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
-+ }else{ \
-+ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
-+ (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
-+ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
-+ return res;
-+
-+
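For a negative count the rounding macros add back the highest bit shifted out, (a >> n) + ((a & (1 << (n-1))) >> (n-1)), which is the same as adding 2^(n-1) before shifting, so halves round up. A scalar illustration (assuming an arithmetic right shift of signed values, as the macros do):

#include <stdint.h>
#include <stdio.h>

static int16_t rshr_round_s16(int16_t a, int n)
{
    return (int16_t)((a >> n) + ((a & (1 << (n - 1))) >> (n - 1)));
}

int main(void)
{
    printf("%d\n", rshr_round_s16(5, 1));      /*  5/2 = 2.5   -> 3  */
    printf("%d\n", rshr_round_s16(6, 2));      /*  6/4 = 1.5   -> 2  */
    printf("%d\n", rshr_round_s16(-5, 1));     /* -5/2 = -2.5  -> -2 */
    printf("%d\n", rshr_round_s16(7, 3));      /*  7/8 = 0.875 -> 1  */
    return 0;
}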
-+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(8,i,8)
-+}
-+
-+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(16,i,4)
-+}
-+
-+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(32,i,2)
-+}
-+
-+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(64,i,1)
-+}
-+
-+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(8,u,8)
-+}
-+
-+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(16,u,4)
-+}
-+
-+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(32,u,2)
-+}
-+
-+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT_64(64,u,1)
-+}
-+
-+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
-+}
-+
-+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
-+}
-+
-+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
-+}
-+
-+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
-+}
-+
-+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
-+}
-+
-+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
-+}
-+
-+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
-+}
-+
-+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
-+}
-+
-+
-+//********** Vector saturating rounding shift left: (negative values shift right) ****************
-+//*************************************************************************************************
-+//No such operations exist in IA32 SIMD, unfortunately; only shifts by a constant are available, so a serial solution is needed
-+//Saturation happens for left shifts only while rounding makes sense for right shifts only.
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-+ int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) res[i] = 0; \
-+ else{ \
-+ if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-+ else{ \
-+ if (btmp[i]>lanesize_1) { \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-+ if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-+ res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-+ else res[i] = atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-+ _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-+ int lanesize = (sizeof(TYPE) << 3); \
-+ _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (atmp[i] ==0) {res[i] = 0; \
-+ }else{ \
-+ if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-+ else{ \
-+ if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
-+ else{ \
-+ limit = (TYPE) 1 << (lanesize - btmp[i]); \
-+ res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
-+ return _mm_load_si128((__m128i*)res);
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
-+ __m64_128 res; int ## TYPE ## _t limit; int i; \
-+ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
-+ else{ \
-+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-+ }else{ \
-+ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
-+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-+ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-+ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-+ for (i = 0; i<LEN; i++) { \
-+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
-+ }else{ \
-+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-+ else{ \
-+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
-+ else{ \
-+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-+ return res;
-+
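-+//Note: the macros above implement the VQRSHL semantics element by element: a zero input gives zero,
-+//negative shift counts shift right with rounding (the last bit shifted out is added back), and
-+//positive counts shift left, saturating to the type's extremes (signed) or to all ones (unsigned).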
-+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
-+}
-+
-+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
-+}
-+
-+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
-+}
-+
-+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
-+}
-+
-+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
-+}
-+
-+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
-+}
-+
-+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
-+}
-+
-+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
-+}
-+
-+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
-+}
-+
-+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
-+}
-+
-+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
-+}
-+
-+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
-+}
-+
-+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
-+}
-+
-+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
-+}
-+
-+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
-+}
-+
-+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
-+}
-+
-+// *********************************************************************************
-+// ***************************** Shifts by a constant *****************************
-+// *********************************************************************************
-+//**************** Vector shift right by constant*************************************
-+//************************************************************************************
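-+//Note: x86 SIMD has no native 8-bit shifts and no 64-bit arithmetic shift, so those cases below are
-+//emulated by widening to 16 bits, masking the bits spoiled by a 16-bit shift, or serial fallbacks.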
-+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_srai_epi16 (r, b); //SSE2
-+ r = _mm_packs_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_srai_epi16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_srai_epi32(_pM128i(a), b));
-+}
-+
-+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //no arithmetic shift for 64bit values, serial solution used
-+ int64x1_t res;
-+ if(b>=64) res.m64_i64[0] = 0;
-+ else res.m64_i64[0] = (*(int64_t*)&a) >> b;
-+ return res;
-+}
-+
-+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i r;
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
-+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
-+{
-+ uint16x4_t res64;
-+ return64(_mm_srli_epi16(_pM128i(a), b));
-+}
-+
-+
-+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ return64(_mm_srli_epi32(_pM128i(a), b));
-+}
-+
-+
-+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
-+{
-+ uint64x1_t res64;
-+ return64(_mm_srli_epi64(_pM128i(a), b));
-+}
-+
-+
-+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i zero, mask0, a_sign, r, a_sign_mask;
-+ _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
-+ zero = _mm_setzero_si128();
-+ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
-+ r = _mm_srai_epi16 (a, b);
-+ a_sign_mask = _mm_and_si128 (mask0, a_sign);
-+ r = _mm_andnot_si128 (mask0, r);
-+ return _mm_or_si128 (r, a_sign_mask);
-+}
-+
-+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-+#define vshrq_n_s16 _mm_srai_epi16
-+
-+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-+#define vshrq_n_s32 _mm_srai_epi32
-+
-+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-+{
-+    //SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
-+ __m128i c1, signmask,a0, res64;
-+ _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
-+ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
-+ signmask = _mm_slli_epi64 (c1, (64 - b));
-+ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
-+ a0 = _MM_CMPEQ_EPI64 (a, a0);
-+ signmask = _mm_and_si128(a0, signmask);
-+ res64 = _mm_srli_epi64 (a, b);
-+ return _mm_or_si128(res64, signmask);
-+}
-+
-+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
-+{
-+ //no 8 bit shift available, need the special trick
-+ __m128i mask0, r;
-+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
-+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ r = _mm_srli_epi16 ( a, b);
-+ return _mm_and_si128 (r, mask0);
-+}
-+
-+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
-+#define vshrq_n_u16 _mm_srli_epi16
-+
-+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-+#define vshrq_n_u32 _mm_srli_epi32
-+
-+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-+#define vshrq_n_u64 _mm_srli_epi64
-+
-+//*************************** Vector shift left by constant *************************
-+//*********************************************************************************
-+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_slli_epi16 (r, b); //SSE2
-+ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_slli_epi16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_slli_epi32(_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
-+{
-+ int64x1_t res64;
-+ return64(_mm_slli_epi64(_pM128i(a), b));
-+}
-+
-+
-+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i mask8;
-+ __m128i r;
-+ mask8 = _mm_set1_epi16(0xff);
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r = _mm_slli_epi16 (r, b); //SSE2
-+ r = _mm_and_si128(r, mask8); //to avoid saturation
-+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-+#define vshl_n_u16 vshl_n_s16
-+
-+
-+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-+#define vshl_n_u32 vshl_n_s32
-+
-+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-+#define vshl_n_u64 vshl_n_s64
-+
-+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+#define vshlq_n_s8 vshlq_n_u8
-+
-+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+#define vshlq_n_s16 _mm_slli_epi16
-+
-+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+#define vshlq_n_s32 _mm_slli_epi32
-+
-+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+#define vshlq_n_s64 _mm_slli_epi64
-+
-+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
-+{
-+ //no 8 bit shift available, need the special trick
-+ __m128i mask0, r;
-+ _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
-+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
-+ r = _mm_slli_epi16 ( a, b);
-+ return _mm_and_si128 (r, mask0);
-+}
-+
-+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-+#define vshlq_n_u16 vshlq_n_s16
-+
-+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-+#define vshlq_n_u32 vshlq_n_s32
-+
-+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-+#define vshlq_n_u64 vshlq_n_s64
-+
-+//************* Vector rounding shift right by constant ******************
-+//*************************************************************************
-+//No corresponding x86 intrinsics exist, need to do some tricks
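-+//Note: rounding is emulated by extracting bit (b-1) of each lane (the last bit shifted out)
-+//and adding it to the truncating shift result.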
-+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit
-+ int8x8_t res64;
-+ __m128i r, maskb;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-+ r = _mm_srai_epi16 (r, b);
-+ r = _mm_add_epi16 (r, maskb); //actual rounding
-+    r = _mm_packs_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
-+{
-+ int16x4_t res64;
-+ return64(vrshrq_n_s16(_pM128i(a), b));
-+}
-+
-+
-+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(vrshrq_n_s32(_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ //serial solution is faster
-+ int64x1_t res;
-+ int64_t a_i64 = *( int64_t*)&a;
-+ if(b==64) {
-+ res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
-+ } else {
-+ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
-+ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
-+{
-+    //no 8 bit shift available, go to 16 bit; this solution may not be optimal compared with the serial one
-+ uint8x8_t res64;
-+ __m128i r, maskb;
-+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-+ r = _mm_srli_epi16 (r, b);
-+ r = _mm_add_epi16 (r, maskb); //actual rounding
-+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
-+ return64(r);
-+}
-+
-+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
-+{
-+ uint16x4_t res64;
-+ return64(vrshrq_n_u16(_pM128i(a), b));
-+}
-+
-+
-+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ return64(vrshrq_n_u32(_pM128i(a), b));
-+}
-+
-+
-+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
-+{
-+ uint64x1_t res64;
-+ return64(vrshrq_n_u64(_pM128i(a), b));
-+}
-+
-+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i r, mask1, maskb;
-+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-+ r = vshrq_n_s8 (a, b);
-+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
-+ return _mm_add_epi8(r, maskb); //actual rounding
-+}
-+
-+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-+ r = _mm_srai_epi16 (a, b);
-+ return _mm_add_epi16 (r, maskb); //actual rounding
-+}
-+
-+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-+ r = _mm_srai_epi32(a, b);
-+ return _mm_add_epi32 (r, maskb); //actual rounding
-+}
-+
-+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-+{
-+    //solution may not be optimal compared with a serial one
-+ __m128i maskb;
-+ int64x2_t r;
-+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-+ r = vshrq_n_s64(a, b);
-+ return _mm_add_epi64 (r, maskb); //actual rounding
-+}
-+
-+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
-+{
-+ //no 8 bit shift available, go to 16 bit trick
-+ __m128i r, mask1, maskb;
-+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-+ r = vshrq_n_u8 (a, b);
-+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
-+ return _mm_add_epi8(r, maskb); //actual rounding
-+}
-+
-+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-+ r = _mm_srli_epi16 (a, b);
-+ return _mm_add_epi16 (r, maskb); //actual rounding
-+}
-+
-+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-+{
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-+ r = _mm_srli_epi32(a, b);
-+ return _mm_add_epi32 (r, maskb); //actual rounding
-+}
-+
-+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
-+{
-+    //solution may not be optimal compared with a serial one
-+ __m128i maskb, r;
-+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-+ r = _mm_srli_epi64(a, b);
-+ return _mm_add_epi64 (r, maskb); //actual rounding
-+}
-+
-+//************* Vector shift right by constant and accumulate *********
-+//*********************************************************************
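-+//Note: each accumulating variant below is simply the matching vshr_n shift followed by a vadd.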
-+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
-+{
-+ int8x8_t shift;
-+ shift = vshr_n_s8(b, c);
-+ return vadd_s8( a, shift);
-+}
-+
-+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
-+{
-+ int16x4_t shift;
-+ shift = vshr_n_s16( b, c);
-+ return vadd_s16(a, shift);
-+}
-+
-+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ int32x2_t shift;
-+ shift = vshr_n_s32(b, c);
-+ return vadd_s32( a, shift);
-+}
-+
-+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-+{
-+    //may not be optimal compared with a serial solution
-+ int64x1_t shift;
-+ shift = vshr_n_s64(b, c);
-+ return vadd_s64( a, shift);
-+}
-+
-+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
-+{
-+ uint8x8_t shift;
-+ shift = vshr_n_u8(b, c);
-+ return vadd_u8(a, shift);
-+}
-+
-+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
-+{
-+ uint16x4_t shift;
-+ shift = vshr_n_u16(b, c);
-+ return vadd_u16(a,shift);
-+}
-+
-+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ uint32x2_t shift;
-+ shift = vshr_n_u32(b, c);
-+ return vadd_u32( a, shift);
-+}
-+
-+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
-+{
-+    //may not be optimal compared with the serial execution
-+ uint64x1_t shift;
-+ shift = vshr_n_u64(b, c);
-+ return vadd_u64(a, shift);
-+}
-+
-+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
-+{
-+ int8x16_t shift;
-+ shift = vshrq_n_s8(b, c);
-+ return vaddq_s8(a, shift);
-+}
-+
-+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
-+{
-+ int16x8_t shift;
-+ shift = vshrq_n_s16(b, c);
-+ return vaddq_s16(a, shift);
-+}
-+
-+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
-+{
-+ int32x4_t shift;
-+ shift = vshrq_n_s32(b, c);
-+ return vaddq_s32(a, shift);
-+}
-+
-+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
-+{
-+ int64x2_t shift;
-+ shift = vshrq_n_s64(b, c);
-+ return vaddq_s64( a, shift);
-+}
-+
-+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
-+{
-+ uint8x16_t shift;
-+ shift = vshrq_n_u8(b, c);
-+ return vaddq_u8(a, shift);
-+}
-+
-+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
-+{
-+ uint16x8_t shift;
-+ shift = vshrq_n_u16(b, c);
-+ return vaddq_u16(a, shift);
-+}
-+
-+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
-+{
-+ uint32x4_t shift;
-+ shift = vshrq_n_u32(b, c);
-+ return vaddq_u32(a, shift);
-+}
-+
-+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
-+{
-+ uint64x2_t shift;
-+ shift = vshrq_n_u64(b, c);
-+ return vaddq_u64(a, shift);
-+}
-+
-+//************* Vector rounding shift right by constant and accumulate ****************************
-+//************************************************************************************************
-+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
-+{
-+ int8x8_t shift;
-+ shift = vrshr_n_s8(b, c);
-+ return vadd_s8( a, shift);
-+}
-+
-+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
-+{
-+ int16x4_t shift;
-+ shift = vrshr_n_s16( b, c);
-+ return vadd_s16(a, shift);
-+}
-+
-+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ int32x2_t shift;
-+ shift = vrshr_n_s32(b, c);
-+ return vadd_s32( a, shift);
-+}
-+
-+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-+{
-+ int64x1_t shift;
-+ shift = vrshr_n_s64(b, c);
-+ return vadd_s64( a, shift);
-+}
-+
-+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
-+{
-+ uint8x8_t shift;
-+ shift = vrshr_n_u8(b, c);
-+ return vadd_u8(a, shift);
-+}
-+
-+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
-+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
-+{
-+ uint16x4_t shift;
-+ shift = vrshr_n_u16(b, c);
-+ return vadd_u16(a,shift);
-+}
-+
-+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
-+{
-+    //may not be optimal compared with the serial execution
-+ uint32x2_t shift;
-+ shift = vrshr_n_u32(b, c);
-+ return vadd_u32( a, shift);
-+}
-+
-+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-+{
-+    //may not be optimal compared with the serial execution
-+ uint64x1_t shift;
-+ shift = vrshr_n_u64(b, c);
-+ return vadd_u64( a, shift);
-+}
-+
-+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
-+{
-+ int8x16_t shift;
-+ shift = vrshrq_n_s8(b, c);
-+ return vaddq_s8(a, shift);
-+}
-+
-+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
-+{
-+ int16x8_t shift;
-+ shift = vrshrq_n_s16(b, c);
-+ return vaddq_s16(a, shift);
-+}
-+
-+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
-+{
-+ int32x4_t shift;
-+ shift = vrshrq_n_s32(b, c);
-+ return vaddq_s32(a, shift);
-+}
-+
-+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-+{
-+ int64x2_t shift;
-+ shift = vrshrq_n_s64(b, c);
-+ return vaddq_s64(a, shift);
-+}
-+
-+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
-+{
-+ uint8x16_t shift;
-+ shift = vrshrq_n_u8(b, c);
-+ return vaddq_u8(a, shift);
-+}
-+
-+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
-+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
-+{
-+ uint16x8_t shift;
-+ shift = vrshrq_n_u16(b, c);
-+ return vaddq_u16(a, shift);
-+}
-+
-+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
-+{
-+ uint32x4_t shift;
-+ shift = vrshrq_n_u32(b, c);
-+ return vaddq_u32(a, shift);
-+}
-+
-+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
-+{
-+ uint64x2_t shift;
-+ shift = vrshrq_n_u64(b, c);
-+ return vaddq_u64(a, shift);
-+}
-+
-+//**********************Vector saturating shift left by constant *****************************
-+//********************************************************************************************
-+//we don't check const ranges assuming they are met
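-+//Note: saturation on the left shift comes from widening to the next lane size and letting the pack
-+//instructions saturate on narrowing; the 32- and 64-bit cases use explicit masks or serial code.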
-+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-+ int8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b);
-+ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
-+{
-+ // go to 32 bit to get the auto saturation (in packs function)
-+ int16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
-+ return64(r128);
-+}
-+
-+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+ //serial execution may be faster
-+ int32x2_t res64;
-+ return64(vqshlq_n_s32 (_pM128i(a), b));
-+}
-+
-+
-+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ int64x1_t res;
-+ int64_t bmask;
-+ int64_t a_i64 = *( int64_t*)&a;
-+ bmask = ( int64_t)1 << (63 - b); //positive
-+ if (a_i64 >= bmask) {
-+ res.m64_i64[0] = ~(_SIGNBIT64);
-+ } else {
-+ res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
-+ }
-+ return res;
-+}
-+
-+
-+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit
-+ uint8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b); //shift_res
-+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
-+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
-+{
-+ // go to 32 bit to get the auto saturation (in packus function)
-+ uint16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
-+ return64(r128);
-+}
-+
-+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
-+{
-+ uint32x2_t res64;
-+ return64(vqshlq_n_u32(_pM128i(a), b));
-+}
-+
-+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ uint64x1_t res;
-+ uint64_t bmask;
-+ uint64_t a_i64 = *(uint64_t*)&a;
-+ bmask = ( uint64_t)1 << (64 - b);
-+ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
-+ return res;
-+}
-+
-+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
-+{
-+ // go to 16 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
-+}
-+
-+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
-+{
-+ // manual saturation solution looks LESS optimal than 32 bits conversion one
-+ // go to 32 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI16_EPI32 (a128);
-+ r128_2 = _mm_slli_epi32 (a128, b);
-+ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
-+}
-+
-+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
-+{
-+ // no 64 bit saturation option available, special tricks necessary
-+ __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
-+ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
-+ maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
-+ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
-+ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
-+ shift_res = _mm_slli_epi32 (a, b);
-+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-+ //result with positive numbers saturated
-+ shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-+ //treat negative numbers
-+ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros
-+ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
-+ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
-+ shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-+ return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-+}
-+
-+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
-+ int64_t bmask;
-+ int i;
-+ bmask = ( int64_t)1 << (63 - b); //positive
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ if (atmp[i] >= bmask) {
-+ res[i] = ~(_SIGNBIT64);
-+ } else {
-+ res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
-+ }
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
-+{
-+ // go to 16 bit to get the auto saturation (in packs function)
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPU8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-+}
-+
-+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
-+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
-+{
-+ // manual saturation solution looks more optimal than 32 bits conversion one
-+ __m128i cb, c8000, a_signed, saturation_mask, shift_res;
-+ cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
-+ c8000 = _mm_set1_epi16 (0x8000);
-+//no unsigned shorts comparison in SSE, only signed available, so need the trick
-+ a_signed = _mm_sub_epi16(a, c8000); //go to signed
-+ saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
-+ shift_res = _mm_slli_epi16 (a, b);
-+ return _mm_or_si128 (shift_res, saturation_mask);
-+}
-+
-+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
-+{
-+ // manual saturation solution, no 64 bit saturation option, the serial version may be faster
-+ __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
-+ cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
-+ c80000000 = _mm_set1_epi32 (0x80000000);
-+//no unsigned ints comparison in SSE, only signed available, so need the trick
-+ a_signed = _mm_sub_epi32(a, c80000000); //go to signed
-+ saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
-+ shift_res = _mm_slli_epi32 (a, b);
-+ return _mm_or_si128 (shift_res, saturation_mask);
-+}
-+
-+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here
-+ _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
-+ uint64_t bmask;
-+ int i;
-+ bmask = ( uint64_t)1 << (64 - b);
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//**************Vector signed->unsigned saturating shift left by constant *************
-+//*************************************************************************************
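-+//Note: negative inputs saturate to zero; the unsigned upper saturation is provided by the packus
-+//narrowing (8/16 bit) or by explicit masks and serial code (32/64 bit).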
-+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
-+{
-+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-+ uint8x8_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi16 (a128, b);
-+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
-+{
-+ uint16x4_t res64;
-+ __m128i a128, r128;
-+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-+ r128 = _mm_slli_epi32 (a128, b); //shift_res
-+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
-+ return64(r128);
-+}
-+
-+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-+_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
-+{
-+    uint32x2_t res64;
-+ return64( vqshluq_n_s32(_pM128i(a), b));
-+}
-+
-+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
-+{
-+ uint64x1_t res;
-+ uint64_t limit;
-+ if (a.m64_i64[0]<=0) {
-+ res.m64_u64[0] = 0;
-+ } else {
-+ limit = (uint64_t) 1 << (64 - b);
-+        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : a.m64_i64[0] << b;
-+ }
-+ return res;
-+}
-+
-+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
-+{
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi16 (a128, b);
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI8_EPI16 (a128);
-+ r128_2 = _mm_slli_epi16 (a128, b);
-+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-+}
-+
-+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
-+{
-+ // manual saturation solution looks LESS optimal than 32 bits conversion one
-+ __m128i a128, r128_1, r128_2;
-+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-+ //swap hi and low part of a128 to process the remaining data
-+ a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-+ a128 = _MM_CVTEPI16_EPI32 (a128);
-+ r128_2 = _mm_slli_epi32 (a128, b);
-+ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
-+}
-+
-+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
-+ zero = _mm_setzero_si128();
-+ maskA = _mm_cmpeq_epi32(a, a);
-+ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
-+ //saturate negative numbers to zero
-+ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
-+ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
-+ //saturate positive to 0xffffffff
-+ a_masked = _mm_and_si128 (a0, maskA);
-+ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
-+ a_shift = _mm_slli_epi32 (a0, b);
-+ return _mm_or_si128 (a_shift, a_masked); //actual saturation
-+}
-+
-+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ // no effective SIMD solution here, serial execution looks faster
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ _NEON2SSE_ALIGN_16 uint64_t res[2];
-+ uint64_t limit;
-+ int i;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ for (i = 0; i<2; i++) {
-+ if (atmp[i]<=0) {
-+ res[i] = 0;
-+ } else {
-+ limit = (uint64_t) 1 << (64 - b);
-+            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : atmp[i] << b;
-+ }
-+ }
-+ return _mm_load_si128((__m128i*)res);
-+}
-+
-+//************** Vector narrowing shift right by constant **************
-+//**********************************************************************
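-+//Note: the pack instructions cannot be used directly here because they would also saturate, so the
-+//narrowing is done with byte shuffles, or with masking before packus for the unsigned cases.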
-+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r16);
-+}
-+
-+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r32);
-+}
-+
-+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ __m128i r64;
-+ r64 = vshrq_n_s64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i mask, r16;
-+ mask = _mm_set1_epi16(0xff);
-+ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
-+ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i mask, r32;
-+ mask = _mm_set1_epi32(0xffff);
-+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res64;
-+ __m128i r64;
-+ r64 = vshrq_n_u64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************** Vector signed->unsigned narrowing saturating shift right by constant ********
-+//*********************************************************************************************
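-+//Note: packus performs the signed-to-unsigned saturating narrowing right after the shift;
-+//only the 64-bit source case needs a serial implementation.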
-+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ uint32x2_t res;
-+ int64_t res64;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if (atmp[0] < 0) {
-+ res.m64_u32[0] = 0;
-+ } else {
-+ res64 = (atmp[0] >> b);
-+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
-+ }
-+ if (atmp[1] < 0) {
-+ res.m64_u32[1] = 0;
-+ } else {
-+ res64 = (atmp[1] >> b);
-+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
-+ }
-+ return res;
-+}
-+
-+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
-+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i r16;
-+ uint8x8_t res64;
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
-+{
-+    //solution may not be optimal compared with the serial one
-+ __m128i r32;
-+ uint16x4_t res64;
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-+{
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ uint32x2_t res;
-+ int64_t res64;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if (atmp[0] < 0) {
-+ res.m64_u32[0] = 0;
-+ } else {
-+ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
-+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
-+ }
-+ if (atmp[1] < 0) {
-+ res.m64_u32[1] = 0;
-+ } else {
-+        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
-+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
-+ }
-+ return res;
-+}
-+
-+//***** Vector narrowing saturating shift right by constant ******
-+//*****************************************************************
-+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_s16(a,b);
-+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ r32 = vshrq_n_s32(a,b);
-+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //no optimal SIMD solution found
-+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
-+ int32x2_t res;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ res64[0] = (atmp[0] >> b);
-+ res64[1] = (atmp[1] >> b);
-+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)res64[0];
-+ res.m64_i32[1] = (int32_t)res64[1];
-+ return res;
-+}
-+
-+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i r64, res_hi, zero;
-+ zero = _mm_setzero_si128();
-+ r64 = vshrq_n_u64(a,b);
-+ res_hi = _mm_srli_epi64(r64, 32);
-+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
-+ r64 = _mm_or_si128(r64, res_hi);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+
-+//********* Vector rounding narrowing shift right by constant *************************
-+//****************************************************************************************
-+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r16);
-+}
-+
-+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 }; //low 16 bits of each 32-bit lane
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-+ return64(r32);
-+}
-+
-+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ __m128i r64;
-+ r64 = vrshrq_n_s64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i mask, r16;
-+ mask = _mm_set1_epi16(0xff);
-+ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i mask, r32;
-+ mask = _mm_set1_epi32(0xffff);
-+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
-+{
-+ uint32x2_t res64;
-+ __m128i r64;
-+ r64 = vrshrq_n_u64(a,b);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************* Vector rounding narrowing saturating shift right by constant ************
-+//****************************************************************************************
-+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
-+{
-+ int8x8_t res64;
-+ __m128i r16;
-+ r16 = vrshrq_n_s16(a,b);
-+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
-+{
-+ int16x4_t res64;
-+ __m128i r32;
-+ r32 = vrshrq_n_s32(a,b);
-+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ //no optimal SIMD solution found
-+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
-+ int32x2_t res;
-+ _mm_store_si128((__m128i*)atmp, a);
-+ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
-+ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
-+ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
-+ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
-+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)res64[0];
-+ res.m64_i32[1] = (int32_t)res64[1];
-+ return res;
-+}
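/* Editor's sketch, not part of the original patch: a scalar model of the rounding
   used by the serial fallbacks above. Adding back the last bit shifted out is the
   same as adding 2^(b-1) before the shift ("round half up"), which is how the NEON
   rounding shifts are defined. The helper name below is hypothetical. */
#include <stdint.h>
static int64_t rounded_shift_right_s64_model(int64_t x, int b /* 1..32 as used above */)
{
    int64_t round_bit = (x >> (b - 1)) & 1;   /* the last bit discarded by x >> b */
    return (x >> b) + round_bit;              /* matches (atmp >> b) + (maskb >> (b - 1)) above */
}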
-+
-+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
-+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
-+{
-+ uint8x8_t res64;
-+ __m128i r16;
-+ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
-+ return64(r16);
-+}
-+
-+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
-+{
-+ uint16x4_t res64;
-+ __m128i r32;
-+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
-+ return64(r32);
-+}
-+
-+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
-+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i r64, res_hi, zero;
-+ zero = _mm_setzero_si128();
-+ r64 = vrshrq_n_u64(a,b);
-+ res_hi = _mm_srli_epi64(r64, 32);
-+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
-+ r64 = _mm_or_si128(r64, res_hi);
-+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(r64);
-+}
-+
-+//************** Vector widening shift left by constant ****************
-+//************************************************************************
-+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-+ return _mm_slli_epi16 (r, b);
-+}
-+
-+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
-+ return _mm_slli_epi32 (r, b);
-+}
-+
-+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
-+{
-+ __m128i r;
-+ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
-+ return _mm_slli_epi64 (r, b);
-+}
-+
-+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
-+{
-+ //no uint8 to uint16 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi8(_pM128i(a), zero);
-+ return _mm_slli_epi16 (r, b);
-+}
-+
-+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
-+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
-+{
-+ //no uint16 to uint32 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi16(_pM128i(a), zero);
-+ return _mm_slli_epi32 (r, b);
-+}
-+
-+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
-+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
-+{
-+ //no uint32 to uint64 conversion available, manual conversion used
-+ __m128i zero, r;
-+ zero = _mm_setzero_si128 ();
-+ r = _mm_unpacklo_epi32(_pM128i(a), zero);
-+ return _mm_slli_epi64 (r, b);
-+}
-+
-+//************************************************************************************
-+//**************************** Shifts with insert ************************************
-+//************************************************************************************
-+//takes each element in a vector, shifts them by an immediate value,
-+//and inserts the results in the destination vector. Bits shifted out of each element are lost.
-+
-+//**************** Vector shift right and insert ************************************
-+//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift.
-+//All other bits are taken from b shifted.
-+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
-+{
-+ int8x8_t res64;
-+ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
-+{
-+ int16x4_t res64;
-+ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
-+{
-+ int32x2_t res64;
-+ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-+{
-+ int64x1_t res;
-+ if (c ==64)
-+ res = a;
-+ else{
-+ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
-+ }
-+ return res;
-+}
-+
-+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+#define vsri_n_u8 vsri_n_s8
-+
-+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+#define vsri_n_u16 vsri_n_s16
-+
-+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-+#define vsri_n_u32 vsri_n_s32
-+
-+
-+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-+#define vsri_n_u64 vsri_n_s64
-+
-+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-+#define vsri_n_p8 vsri_n_u8
-+
-+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-+#define vsri_n_p16 vsri_n_u16
-+
-+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
-+{
-+ __m128i maskA, a_masked;
-+ uint8x16_t b_shift;
-+ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
-+ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
-+ a_masked = _mm_and_si128 (a, maskA);
-+ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
-+ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
-+}
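/* Editor's sketch, not part of the original patch: the per-lane VSRI semantics that
   the mask-based vsriq_n_s8 above implements -- keep the top c bits of "a" and take
   the remaining bits from "b" shifted right logically by c. Helper name is hypothetical. */
#include <stdint.h>
static uint8_t vsri_lane_u8_model(uint8_t a, uint8_t b, int c /* 1..8 */)
{
    uint8_t keep_a = (uint8_t)(a & (uint8_t)(0xFFu << (8 - c)));  /* c ones then (8-c) zeros, as maskLeft[c] */
    return (uint8_t)(keep_a | (uint8_t)(b >> c));                 /* b >> c has c leading zeros */
}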
-+
-+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
-+{
-+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
-+ uint16x8_t b_shift;
-+ uint16x8_t a_c;
-+ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
-+ a_c = vshrq_n_u16( a, (16 - c));
-+ a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
-+{
-+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
-+ uint32x4_t b_shift;
-+ uint32x4_t a_c;
-+ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
-+ a_c = vshrq_n_u32( a, (32 - c));
-+ a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-+{
-+ //serial solution may be faster
-+ uint64x2_t b_shift;
-+ uint64x2_t a_c;
-+ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
-+ a_c = _mm_srli_epi64(a, (64 - c));
-+ a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
-+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-+}
-+
-+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+#define vsriq_n_u8 vsriq_n_s8
-+
-+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+#define vsriq_n_u16 vsriq_n_s16
-+
-+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-+#define vsriq_n_u32 vsriq_n_s32
-+
-+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-+#define vsriq_n_u64 vsriq_n_s64
-+
-+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-+#define vsriq_n_p8 vsriq_n_u8
-+
-+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-+#define vsriq_n_p16 vsriq_n_u16
-+
-+//***** Vector shift left and insert *********************************************
-+//*********************************************************************************
-+//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift.
-+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
-+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
-+{
-+ int8x8_t res64;
-+ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
-+{
-+ int16x4_t res64;
-+ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
-+}
-+
-+
-+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
-+{
-+ int32x2_t res64;
-+ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
-+}
-+
-+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
-+ return res;
-+}
-+
-+
-+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+#define vsli_n_u8 vsli_n_s8
-+
-+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+#define vsli_n_u16 vsli_n_s16
-+
-+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-+#define vsli_n_u32 vsli_n_s32
-+
-+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-+#define vsli_n_u64 vsli_n_s64
-+
-+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-+#define vsli_n_p8 vsli_n_u8
-+
-+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-+#define vsli_n_p16 vsli_n_u16
-+
-+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
-+{
-+ __m128i maskA, a_masked;
-+ int8x16_t b_shift;
-+ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
-+ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
-+ b_shift = vshlq_n_s8( b, c);
-+ a_masked = _mm_and_si128 (a, maskA);
-+ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
-+}
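/* Editor's sketch, not part of the original patch: the per-lane VSLI semantics that
   the mask-based vsliq_n_s8 above implements -- keep the low c bits of "a" and take
   the remaining bits from "b" shifted left by c. Helper name is hypothetical. */
#include <stdint.h>
static uint8_t vsli_lane_u8_model(uint8_t a, uint8_t b, int c /* 0..7 */)
{
    uint8_t keep_a = (uint8_t)(a & (uint8_t)((1u << c) - 1u));    /* low c bits of a, as maskRight[c] */
    return (uint8_t)((uint8_t)(b << c) | keep_a);
}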
-+
-+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
-+{
-+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
-+ int16x8_t b_shift;
-+ int16x8_t a_c;
-+ b_shift = vshlq_n_s16( b, c);
-+ a_c = vshlq_n_s16( a, (16 - c));
-+ a_c = _mm_srli_epi16(a_c, (16 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
-+{
-+ //solution may not be optimal compared with the serial one
-+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
-+ int32x4_t b_shift;
-+ int32x4_t a_c;
-+ b_shift = vshlq_n_s32( b, c);
-+ a_c = vshlq_n_s32( a, (32 - c));
-+ a_c = _mm_srli_epi32(a_c, (32 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
-+{
-+ //solution may not be optimal compared with the serial one
-+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
-+ int64x2_t b_shift;
-+ int64x2_t a_c;
-+ b_shift = vshlq_n_s64( b, c);
-+ a_c = vshlq_n_s64( a, (64 - c));
-+ a_c = _mm_srli_epi64(a_c, (64 - c));
-+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-+}
-+
-+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+#define vsliq_n_u8 vsliq_n_s8
-+
-+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+#define vsliq_n_u16 vsliq_n_s16
-+
-+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-+#define vsliq_n_u32 vsliq_n_s32
-+
-+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-+#define vsliq_n_u64 vsliq_n_s64
-+
-+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-+#define vsliq_n_p8 vsliq_n_u8
-+
-+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-+#define vsliq_n_p16 vsliq_n_u16
-+
-+// ***********************************************************************************************
-+// ****************** Loads and stores of a single vector ***************************************
-+// ***********************************************************************************************
-+//Performs loads and stores of a single vector of some type.
-+//******************************* Loads ********************************************************
-+// ***********************************************************************************************
-+//We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
-+//Also, on SSE3-capable systems, using __m128i _mm_lddqu_si128 (__m128i const* p) for unaligned access may be advantageous:
-+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
-+//If the ptr is aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
-+#define LOAD_SI128(ptr) \
-+ ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
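/* Editor's sketch, not part of the original patch: the run-time dispatch the LOAD_SI128
   macro above performs, written out as a plain function (SSE2, <emmintrin.h>). */
#include <emmintrin.h>
static __m128i load_si128_example(const void *ptr)
{
    if (((unsigned long)ptr & 15) == 0)                  /* 16-byte aligned */
        return _mm_load_si128((const __m128i *)ptr);
    return _mm_loadu_si128((const __m128i *)ptr);        /* safe for any alignment */
}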
-+
-+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_u8 LOAD_SI128
-+
-+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_u16 LOAD_SI128
-+
-+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+#define vld1q_u32 LOAD_SI128
-+
-+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld1q_u64 LOAD_SI128
-+
-+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_s8 LOAD_SI128
-+
-+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_s16 LOAD_SI128
-+
-+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+#define vld1q_s32 LOAD_SI128
-+
-+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld1q_s64 LOAD_SI128
-+
-+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
-+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
-+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-+__m128 f2;
-+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
-+}*/
-+
-+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
-+{
-+ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
-+ return _mm_load_ps(ptr);
-+ else
-+ return _mm_loadu_ps(ptr);
-+}
-+
-+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-+#define vld1q_p8 LOAD_SI128
-+
-+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-+#define vld1q_p16 LOAD_SI128
-+
-+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
-+
-+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_u16 vld1_u8
-+
-+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-+#define vld1_u32 vld1_u8
-+
-+
-+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_u64 vld1_u8
-+
-+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_s8 vld1_u8
-+
-+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_s16 vld1_u16
-+
-+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-+#define vld1_s32 vld1_u32
-+
-+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_s64 vld1_u64
-+
-+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-+
-+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = *(ptr);
-+ res.m64_f32[1] = *(ptr + 1);
-+ return res;
-+}
-+
-+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-+#define vld1_p8 vld1_u8
-+
-+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-+#define vld1_p16 vld1_u16
-+
-+//***********************************************************************************************************
-+//******* Lane load functions - insert the data at vector's given position (lane) *************************
-+//***********************************************************************************************************
-+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-+
-+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
-+
-+
-+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-+
-+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ //we need to deal with the case where ptr is not 16-byte aligned
-+ __m128 p;
-+ p = _mm_set1_ps(*(ptr));
-+ return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
-+}
-+
-+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
-+
-+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-+
-+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-+
-+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8x8_t res;
-+ res = vec;
-+ res.m64_u8[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16x4_t res;
-+ res = vec;
-+ res.m64_u16[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32x2_t res;
-+ res = vec;
-+ res.m64_u32[lane] = *(ptr);
-+ return res;
-+}
-+
-+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = *(ptr);
-+ return res;
-+}
-+
-+
-+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
-+
-+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
-+
-+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
-+
-+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32x2_t res;
-+ res = vec;
-+ res.m64_f32[lane] = *(ptr);
-+ return res;
-+}
-+
-+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
-+
-+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-+#define vld1_lane_p8 vld1_lane_u8
-+
-+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-+#define vld1_lane_p16 vld1_lane_s16
-+
-+// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
-+// ******************************************************************************************************************
-+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
-+
-+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
-+
-+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
-+
-+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
-+ return LOAD_SI128(val);
-+}
-+
-+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
-+
-+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
-+
-+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
-+
-+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
-+
-+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+//current IA SIMD doesn't support float16, need to go to 32 bits
-+
-+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
-+
-+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
-+
-+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
-+
-+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint8x8_t res;
-+ int i;
-+ for(i = 0; i<8; i++) {
-+ res.m64_u8[i] = *(ptr);
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint16x4_t res;
-+ int i;
-+ for(i = 0; i<4; i++) {
-+ res.m64_u16[i] = *(ptr);
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = *(ptr);
-+ res.m64_u32[1] = *(ptr);
-+ return res;
-+}
-+
-+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = *(ptr);
-+ return res;
-+}
-+
-+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
-+
-+
-+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
-+
-+
-+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
-+
-+
-+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
-+
-+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = *(ptr);
-+ res.m64_f32[1] = res.m64_f32[0];
-+ return res; // use last 64bits only
-+}
-+
-+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-+#define vld1_dup_p8 vld1_dup_u8
-+
-+
-+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-+#define vld1_dup_p16 vld1_dup_u16
-+
-+
-+//*************************************************************************************
-+//********************************* Store **********************************************
-+//*************************************************************************************
-+// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
-+//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
-+#define STORE_SI128(ptr, val) \
-+ (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
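/* Editor's sketch, not part of the original patch: the cache-bypassing alternative
   mentioned above for 16-byte aligned pointers -- a non-temporal store (SSE2). */
#include <emmintrin.h>
static void stream_store_example(void *ptr /* must be 16-byte aligned */, __m128i val)
{
    _mm_stream_si128((__m128i *)ptr, val);   /* store without polluting the cache */
    _mm_sfence();                            /* order the non-temporal store before later stores */
}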
-+
-+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_u8 STORE_SI128
-+
-+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_u16 STORE_SI128
-+
-+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-+#define vst1q_u32 STORE_SI128
-+
-+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-+#define vst1q_u64 STORE_SI128
-+
-+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_s8 STORE_SI128
-+
-+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_s16 STORE_SI128
-+
-+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-+#define vst1q_s32 STORE_SI128
-+
-+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-+#define vst1q_s64 STORE_SI128
-+
-+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
-+{
-+ if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
-+ _mm_store_ps (ptr, val);
-+ else
-+ _mm_storeu_ps (ptr, val);
-+}
-+
-+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-+#define vst1q_p8 vst1q_u8
-+
-+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-+#define vst1q_p16 vst1q_u16
-+
-+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
-+{
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ *(ptr + i) = ((uint8_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
-+{
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ *(ptr + i) = ((uint16_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
-+{
-+ int i;
-+ for (i = 0; i<2; i++) {
-+ *(ptr + i) = ((uint32_t*)&val)[i];
-+ }
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
-+{
-+ *(ptr) = *((uint64_t*)&val);
-+ //_mm_storel_epi64((__m128i*)ptr, val);
-+ return;
-+}
-+
-+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
-+
-+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
-+
-+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
-+
-+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
-+
-+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
-+{
-+ *(ptr) = val.m64_f32[0];
-+ *(ptr + 1) = val.m64_f32[1];
-+ return;
-+}
-+
-+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-+#define vst1_p8 vst1_u8
-+
-+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
-+#define vst1_p16 vst1_u16
-+
-+//***********Store a lane of a vector into memory (extract given lane) *********************
-+//******************************************************************************************
-+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-+
-+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-+
-+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-+
-+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-+
-+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-+
-+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-+
-+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-+
-+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-+
-+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
-+{
-+ int32_t ilane;
-+ ilane = _MM_EXTRACT_PS(val,lane);
-+ *(ptr) = *((float*)&ilane);
-+}
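/* Editor's sketch, not part of the original patch: the bit reinterpretation done above
   via *(float*)&ilane, written with memcpy to sidestep strict-aliasing concerns. */
#include <stdint.h>
#include <string.h>
static float bits_to_float_model(int32_t bits)
{
    float f;
    memcpy(&f, &bits, sizeof f);   /* reinterpret the 32-bit pattern as a float */
    return f;
}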
-+
-+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1q_lane_p8 vst1q_lane_u8
-+
-+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1q_lane_p16 vst1q_lane_s16
-+
-+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val.m64_u8[lane];
-+}
-+
-+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val.m64_u16[lane];
-+}
-+
-+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val.m64_u32[lane];
-+}
-+
-+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
-+{
-+ *(ptr) = val.m64_u64[0];
-+}
-+
-+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
-+
-+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
-+
-+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
-+
-+
-+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
-+
-+
-+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val.m64_f32[lane];
-+}
-+
-+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-+#define vst1_lane_p8 vst1_lane_u8
-+
-+void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-+#define vst1_lane_p16 vst1_lane_s16
-+
-+//***********************************************************************************************
-+//**************** Loads and stores of an N-element structure **********************************
-+//***********************************************************************************************
-+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
-+//We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions
-+//****************** 2 elements load *********************************************
-+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
-+{
-+ uint8x16x2_t v;
-+ v.val[0] = vld1q_u8(ptr);
-+ v.val[1] = vld1q_u8((ptr + 16));
-+ v = vuzpq_s8(v.val[0], v.val[1]);
-+ return v;
-+}
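/* Editor's sketch, not part of the original patch: what the vuzpq-based vld2q_u8 above
   computes, expressed serially -- de-interleave a0,b0,a1,b1,... into two vectors. */
#include <stdint.h>
static void vld2q_u8_model(const uint8_t *ptr, uint8_t val0[16], uint8_t val1[16])
{
    int i;
    for (i = 0; i < 16; i++) {
        val0[i] = ptr[2 * i];       /* val[0]: elements 0,2,4,... */
        val1[i] = ptr[2 * i + 1];   /* val[1]: elements 1,3,5,... */
    }
}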
-+
-+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = vld1q_u16( ptr);
-+ v.val[1] = vld1q_u16( (ptr + 8));
-+ v = vuzpq_s16(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = vld1q_u32 ( ptr);
-+ v.val[1] = vld1q_u32 ( (ptr + 4));
-+ v = vuzpq_s32(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
-+#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
-+
-+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
-+
-+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
-+
-+
-+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-+{
-+ float32x4x2_t v;
-+ v.val[0] = vld1q_f32 (ptr);
-+ v.val[1] = vld1q_f32 ((ptr + 4));
-+ v = vuzpq_f32(v.val[0], v.val[1]);
-+ return v;
-+}
-+
-+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-+#define vld2q_p8 vld2q_u8
-+
-+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-+#define vld2q_p16 vld2q_u16
-+
-+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
-+{
-+ uint8x8x2_t v;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
-+ __m128i ld128;
-+ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
-+ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
-+ return v;
-+}
-+
-+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint16x4x2_t v;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ __m128i ld128;
-+ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
-+ vst1q_u16((v.val), ld128);
-+ return v;
-+}
-+
-+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
-+{
-+ _NEON2SSE_ALIGN_16 uint32x2x2_t v;
-+ __m128i ld128;
-+ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
-+ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-+ vst1q_u32((v.val), ld128);
-+ return v;
-+}
-+
-+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
-+{
-+ uint64x1x2_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ return v;
-+}
-+
-+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
-+
-+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
-+
-+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
-+
-+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
-+
-+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
-+
-+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
-+{
-+ float32x2x2_t v;
-+ v.val[0].m64_f32[0] = *(ptr);
-+ v.val[0].m64_f32[1] = *(ptr + 2);
-+ v.val[1].m64_f32[0] = *(ptr + 1);
-+ v.val[1].m64_f32[1] = *(ptr + 3);
-+ return v;
-+}
-+
-+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-+#define vld2_p8 vld2_u8
-+
-+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-+#define vld2_p16 vld2_u16
-+
-+//******************** Triplets ***************************************
-+//*********************************************************************
-+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
-+ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
-+ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
-+ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
-+ uint8x16x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
-+
-+ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
-+ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
-+ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
-+
-+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
-+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
-+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
-+
-+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
-+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
-+ tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
-+ tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
-+ v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
-+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
-+
-+ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
-+ tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
-+ v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
-+ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
-+ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
-+ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
-+ tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
-+ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
-+
-+ tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
-+ v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
-+ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
-+ tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
-+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
-+ return v;
-+}
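/* Editor's sketch, not part of the original patch: the 3-way de-interleave that the
   shuffle sequence in vld3q_u8 above assembles, expressed serially. */
#include <stdint.h>
static void vld3q_u8_model(const uint8_t *ptr, uint8_t val0[16], uint8_t val1[16], uint8_t val2[16])
{
    int i;
    for (i = 0; i < 16; i++) {
        val0[i] = ptr[3 * i];       /* x0,x3,x6,... */
        val1[i] = ptr[3 * i + 1];   /* x1,x4,x7,... */
        val2[i] = ptr[3 * i + 2];   /* x2,x5,x8,... */
    }
}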
-+
-+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
-+{
-+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
-+ uint16x8x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
-+ _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
-+
-+ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
-+ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
-+ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
-+
-+ tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
-+ tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
-+ tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
-+
-+ tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
-+ tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
-+ tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
-+ tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
-+ v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
-+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
-+
-+ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
-+ tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
-+ v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
-+ v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
-+ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
-+ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
-+ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
-+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
-+
-+ tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
-+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
-+ v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
-+ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
-+ tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
-+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
-+ return v;
-+}
-+
-+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ uint32x4x3_t v;
-+ __m128i tmp0, tmp1,tmp2, tmp3;
-+ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
-+ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
-+ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
-+
-+ tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
-+ tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
-+ tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
-+
-+ tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
-+ v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
-+ tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
-+ v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
-+ v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
-+ v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
-+ return v;
-+}
-+
-+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
-+
-+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
-+
-+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
-+
-+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ float32x4x3_t v;
-+ __m128 tmp0, tmp1,tmp2, tmp3;
-+ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
-+ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
-+ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
-+
-+ tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
-+ tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
-+ tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
-+ tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
-+
-+ v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
-+ tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
-+ v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
-+ v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
-+ v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
-+ return v;
-+}
-+
-+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-+#define vld3q_p8 vld3q_u8
-+
-+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-+#define vld3q_p16 vld3q_u16
-+
-+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
-+{
-+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
-+ uint8x8x3_t v;
-+ __m128i val0, val1, val2, tmp0, tmp1;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
-+ _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
-+ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
-+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
-+
-+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
-+ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
-+ val0 = _mm_slli_si128(tmp0,10);
-+ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
-+ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
-+ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
-+ _M64(v.val[0], val0);
-+ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
-+ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
-+ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
-+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
-+ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
-+ _M64(v.val[1], val1);
-+
-+ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
-+ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
-+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
-+ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
-+ uint16x4x3_t v;
-+ __m128i val0, val1, val2, tmp0, tmp1;
-+ _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
-+ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
-+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
-+
-+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
-+ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
-+ val0 = _mm_slli_si128(tmp0,10);
-+ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
-+ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
-+ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
-+ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
-+ _M64(v.val[0], val0);
-+
-+ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
-+ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
-+ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
-+ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
-+ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
-+ _M64(v.val[1], val1);
-+
-+ tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
-+ tmp1 = _mm_srli_si128(tmp1,4);
-+ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
-+ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
-+{
-+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
-+ uint32x2x3_t v;
-+ __m128i val0, val1, val2;
-+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
-+ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
-+
-+ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
-+ _M64(v.val[0], val0);
-+ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
-+ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
-+ _M64(v.val[1], val1);
-+ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
-+{
-+ uint64x1x3_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ return v;
-+}
-+
-+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
-+
-+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
-+
-+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
-+
-+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
-+
-+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
-+{
-+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
-+ float32x2x3_t v;
-+ v.val[0].m64_f32[0] = *(ptr);
-+ v.val[0].m64_f32[1] = *(ptr + 3);
-+
-+ v.val[1].m64_f32[0] = *(ptr + 1);
-+ v.val[1].m64_f32[1] = *(ptr + 4);
-+
-+ v.val[2].m64_f32[0] = *(ptr + 2);
-+ v.val[2].m64_f32[1] = *(ptr + 5);
-+ return v;
-+}
-+
-+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-+#define vld3_p8 vld3_u8
-+
-+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-+#define vld3_p16 vld3_u16
-+
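-+//Usage sketch (illustrative only, not part of the intrinsic set above; the function and
-+//buffer names split_rgb16, rgb, r, g, b are hypothetical): vld3q_u8 de-interleaves 48
-+//packed RGB bytes into separate R, G and B planes of 16 bytes each.
-+// void split_rgb16(const uint8_t *rgb, uint8_t *r, uint8_t *g, uint8_t *b)
-+// {
-+//     uint8x16x3_t pix = vld3q_u8(rgb); //val[0]=R0..R15, val[1]=G0..G15, val[2]=B0..B15
-+//     vst1q_u8(r, pix.val[0]);
-+//     vst1q_u8(g, pix.val[1]);
-+//     vst1q_u8(b, pix.val[2]);
-+// }
-+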
-+//*************** Quadruples load ********************************
-+//*****************************************************************
-+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
-+{
-+ uint8x16x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+
-+ v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
-+ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
-+ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
-+ v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
-+
-+ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
-+ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
-+ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
-+ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
-+
-+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
-+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
-+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
-+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
-+
-+ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
-+ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
-+ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
-+ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
-+
-+ v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
-+ v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
-+ v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
-+ v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
-+ return v;
-+}
-+
-+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
-+ tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
-+ tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
-+ tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
-+ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
-+ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
-+ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
-+ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
-+ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
-+ tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
-+ tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
-+ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
-+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
-+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
-+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
-+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
-+ return v;
-+}
-+
-+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
-+{
-+ uint32x4x4_t v;
-+ __m128i tmp3, tmp2, tmp1, tmp0;
-+ v.val[0] = vld1q_u32 (ptr);
-+ v.val[1] = vld1q_u32 ((ptr + 4));
-+ v.val[2] = vld1q_u32 ((ptr + 8));
-+ v.val[3] = vld1q_u32 ((ptr + 12));
-+ tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
-+ tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
-+ tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
-+ tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
-+ v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
-+ v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
-+ v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
-+ v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
-+ return v;
-+}
-+
-+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
-+
-+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
-+
-+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
-+
-+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
-+{
-+ float32x4x4_t v;
-+ __m128 tmp3, tmp2, tmp1, tmp0;
-+
-+ v.val[0] = vld1q_f32 ((float*) ptr);
-+ v.val[1] = vld1q_f32 ((float*) (ptr + 4));
-+ v.val[2] = vld1q_f32 ((float*) (ptr + 8));
-+ v.val[3] = vld1q_f32 ((float*) (ptr + 12));
-+ tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
-+ tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
-+ tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
-+ tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
-+ v.val[0] = _mm_movelh_ps(tmp0, tmp2);
-+ v.val[1] = _mm_movehl_ps(tmp2, tmp0);
-+ v.val[2] = _mm_movelh_ps(tmp1, tmp3);
-+ v.val[3] = _mm_movehl_ps(tmp3, tmp1);
-+ return v;
-+}
-+
-+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-+#define vld4q_p8 vld4q_u8
-+
-+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-+#define vld4q_p16 vld4q_s16
-+
-+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
-+{
-+ uint8x8x4_t v;
-+ __m128i sh0, sh1;
-+ __m128i val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
-+
-+ val0 = vld1q_u8(( ptr)); //load first and second 64 bits for val[0] and val[1]
-+ val2 = vld1q_u8(( ptr + 16)); //load third and fourth 64 bits for val[2] and val[3]
-+
-+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
-+ vst1q_u8(&v.val[0], val0 );
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
-+ vst1q_u8(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
-+{
-+ uint16x4x4_t v;
-+ __m128i sh0, sh1;
-+ __m128i val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
-+ val0 = vld1q_u16 ( (ptr)); //load first and second 64 bits for val[0] and val[1]
-+ val2 = vld1q_u16 ( (ptr + 8)); //load third and fourth 64 bits for val[2] and val[3]
-+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
-+ vst1q_u16(&v.val[0], val0 );
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
-+ vst1q_u16(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
-+{
-+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-+ uint32x2x4_t v;
-+ __m128i val0, val01, val2;
-+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
-+ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
-+ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
-+ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
-+ vst1q_u32(&v.val[0], val01);
-+ vst1q_u32(&v.val[2], val2 );
-+ return v;
-+}
-+
-+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
-+{
-+ uint64x1x4_t v;
-+ v.val[0].m64_u64[0] = *(ptr); //load first 64 bits into val[0]
-+ v.val[1].m64_u64[0] = *(ptr + 1); //load second 64 bits into val[1]
-+ v.val[2].m64_u64[0] = *(ptr + 2); //load third 64 bits into val[2]
-+ v.val[3].m64_u64[0] = *(ptr + 3); //load fourth 64 bits into val[3]
-+ return v;
-+}
-+
-+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
-+
-+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
-+
-+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
-+
-+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
-+
-+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
-+{
-+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-+ float32x2x4_t res;
-+ res.val[0].m64_f32[0] = *(ptr);
-+ res.val[0].m64_f32[1] = *(ptr + 4);
-+ res.val[1].m64_f32[0] = *(ptr + 1);
-+ res.val[1].m64_f32[1] = *(ptr + 5);
-+ res.val[2].m64_f32[0] = *(ptr + 2);
-+ res.val[2].m64_f32[1] = *(ptr + 6);
-+ res.val[3].m64_f32[0] = *(ptr + 3);
-+ res.val[3].m64_f32[1] = *(ptr + 7);
-+ return res;
-+}
-+
-+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-+#define vld4_p8 vld4_u8
-+
-+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-+#define vld4_p16 vld4_u16
-+
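-+//Usage sketch (illustrative only; the names rgba and dst are hypothetical): vld4q_u8
-+//de-interleaves 64 packed RGBA bytes, so a single plane, e.g. alpha, can be picked out.
-+// uint8x16x4_t pix = vld4q_u8(rgba); //val[0]=R, val[1]=G, val[2]=B, val[3]=A
-+// vst1q_u8(dst, pix.val[3]); //store only the 16 alpha bytes
-+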
-+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
-+//*******************************************************************************************************************
-+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
-+{
-+ uint8x8x2_t v;
-+ __m128i val0, val1;
-+ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ vst1q_u8(v.val, val0);
-+ return v;
-+}
-+
-+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
-+{
-+ uint16x4x2_t v;
-+ __m128i val0, val1;
-+ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
-+ _M64(v.val[0], val0);
-+ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
-+ _M64(v.val[1], val1);
-+ return v;
-+}
-+
-+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
-+{
-+ uint32x2x2_t v;
-+ __m128i val0;
-+ val0 = LOAD_SI128(ptr); //0,1,x,x
-+ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
-+ vst1q_u32(v.val, val0);
-+ return v;
-+}
-+
-+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_dup_u64 vld2_u64
-+
-+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
-+
-+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
-+
-+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
-+
-+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
-+
-+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
-+{
-+ float32x2x2_t v;
-+ v.val[0].m64_f32[0] = *(ptr); //0,0
-+ v.val[0].m64_f32[1] = *(ptr); //0,0
-+ v.val[1].m64_f32[0] = *(ptr + 1); //1,1
-+ v.val[1].m64_f32[1] = *(ptr + 1); //1,1
-+ return v;
-+}
-+
-+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-+#define vld2_dup_p8 vld2_dup_u8
-+
-+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-+#define vld2_dup_p16 vld2_dup_s16
-+
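-+//Usage sketch (illustrative only; the pointer name coef is hypothetical): vld2_dup_f32
-+//broadcasts one {re, im} pair so it can be reused against a whole vector of complex values.
-+// float32x2x2_t c = vld2_dup_f32(coef); //c.val[0] = {re, re}, c.val[1] = {im, im}
-+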
-+//************* Duplicate (or propagate) triplets: *******************
-+//********************************************************************
-+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
-+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint8x8x3_t v;
-+ __m128i val0, val1, val2;
-+ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
-+ vst1q_u8(v.val, val0);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint16x4x3_t v;
-+ __m128i val0, val1, val2;
-+ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
-+ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
-+ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+{
-+ uint32x2x3_t v;
-+ __m128i val0, val1, val2;
-+ val2 = LOAD_SI128(ptr); //0,1,2,x
-+ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
-+ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
-+ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ return v;
-+}
-+
-+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
-+{
-+ uint64x1x3_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ return v;
-+}
-+
-+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
-+
-+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
-+
-+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
-+
-+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
-+
-+
-+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
-+{
-+ float32x2x3_t v;
-+ int i;
-+ for (i = 0; i<3; i++) {
-+ v.val[i].m64_f32[0] = *(ptr + i);
-+ v.val[i].m64_f32[1] = *(ptr + i);
-+ }
-+ return v;
-+}
-+
-+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_p8 vld3_dup_u8
-+
-+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-+#define vld3_dup_p16 vld3_dup_s16
-+
-+
-+//************* Duplicate (or propagate) quadruples: *******************
-+//***********************************************************************
-+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
-+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint8x8x4_t v;
-+ __m128i val0, val1, val2;
-+ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
-+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
-+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
-+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
-+ vst1q_u8(&v.val[0], val0);
-+ vst1q_u8(&v.val[2], val2);
-+ return v;
-+}
-+
-+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint16x4x4_t v;
-+ __m128i val0, val1, val2, val3;
-+ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
-+ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
-+ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
-+ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
-+ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ _M64(v.val[3], val3);
-+ return v;
-+}
-+
-+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ uint32x2x4_t v;
-+ __m128i val0, val1, val2, val3;
-+ val3 = LOAD_SI128(ptr); //0,1,2,3
-+ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
-+ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
-+ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
-+ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
-+ _M64(v.val[0], val0);
-+ _M64(v.val[1], val1);
-+ _M64(v.val[2], val2);
-+ _M64(v.val[3], val3);
-+ return v;
-+}
-+
-+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
-+{
-+ uint64x1x4_t v;
-+ v.val[0].m64_u64[0] = *(ptr);
-+ v.val[1].m64_u64[0] = *(ptr + 1);
-+ v.val[2].m64_u64[0] = *(ptr + 2);
-+ v.val[3].m64_u64[0] = *(ptr + 3);
-+ return v;
-+}
-+
-+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
-+
-+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
-+
-+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
-+
-+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
-+
-+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-+
-+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-+{
-+ float32x2x4_t v;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ v.val[i].m64_f32[0] = *(ptr + i);
-+ v.val[i].m64_f32[1] = *(ptr + i);
-+ }
-+ return v;
-+}
-+
-+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_p8 vld4_dup_u8
-+
-+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-+#define vld4_dup_p16 vld4_dup_u16
-+
-+
-+//**********************************************************************************
-+//*******************Lane loads for an N-element structures ***********************
-+//**********************************************************************************
-+//********************** Lane pairs ************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, resulting in a "formal parameter with __declspec(align('16')) won't be aligned" error
-+//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined as macros
-+
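-+//Note that, because the macros below pass &src, the src argument must be an lvalue
-+//(the names buf, p and acc are hypothetical):
-+// uint16x8x2_t acc = vld2q_u16(buf);
-+// acc = vld2q_lane_u16(p, acc, 3); //fine: acc is an lvalue
-+// //vld2q_lane_u16(p, vld2q_u16(buf), 3); //would not compile: cannot take the address of a temporary
-+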
-+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
-+{
-+ int16x8x2_t v;
-+ v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
-+
-+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
-+{
-+ int32x4x2_t v;
-+ v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
-+
-+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
-+{
-+ float32x4x2_t v;
-+ v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-+#define vld2q_lane_p16 vld2q_lane_u16
-+
-+//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
-+{
-+ uint8x8x2_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
-+{
-+ uint16x4x2_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
-+{
-+ uint32x2x2_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
-+
-+//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
-+
-+//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
-+
-+//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
-+
-+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
-+{
-+ float32x2x2_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ return v;
-+}
-+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
-+
-+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_p8 vld2_lane_u8
-+
-+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-+#define vld2_lane_p16 vld2_lane_u16
-+
-+//*********** Lane triplets **********************
-+//*************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ uint16x8x3_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ uint32x4x3_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ int16x8x3_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
-+
-+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ int32x4x3_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
-+
-+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
-+
-+
-+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-+{
-+ float32x4x3_t v;
-+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
-+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
-+ return v;
-+}
-+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
-+
-+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-+#define vld3q_lane_p16 vld3q_lane_u16
-+
-+//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ uint8x8x3_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ uint16x4x3_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ //need to merge into 128 bit anyway
-+ uint32x2x3_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
-+
-+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
-+
-+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
-+
-+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
-+
-+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-+{
-+ float32x2x3_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
-+ return v;
-+}
-+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_p8 vld3_lane_u8
-+
-+//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-+#define vld3_lane_p16 vld3_lane_u16
-+
-+//******************* Lane Quadruples load ***************************
-+//*********************************************************************
-+//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
-+//we assume src is 16-byte aligned
-+
-+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
-+{
-+ uint16x8x4_t v;
-+ v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
-+ v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
-+{
-+ uint32x4x4_t v;
-+ v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
-+ v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
-+ v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
-+ v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
-+
-+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
-+
-+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
-+
-+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
-+{
-+ float32x4x4_t v;
-+ v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
-+ v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
-+ v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
-+ v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
-+ return v;
-+}
-+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
-+
-+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+#define vld4q_lane_p16 vld4q_lane_u16
-+
-+//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
-+{
-+ uint8x8x4_t v;
-+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
-+
-+//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
-+{
-+ uint16x4x4_t v;
-+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
-+
-+//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
-+{
-+ uint32x2x4_t v;
-+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
-+
-+//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
-+#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
-+
-+//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
-+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
-+
-+//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
-+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
-+
-+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
-+{
-+ //serial solution may be faster
-+ float32x2x4_t v;
-+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
-+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
-+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
-+ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
-+ return v;
-+}
-+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
-+
-+//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
-+#define vld4_lane_p8 vld4_lane_u8
-+
-+//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
-+#define vld4_lane_p16 vld4_lane_u16
-+
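A minimal usage sketch for the vld4_lane_* loads above, assuming the code is compiled against this arm_neon.h shim (or a native NEON toolchain); buffer names and contents are illustrative only. One interleaved 4-element group is scattered into lane 3 of four separate vectors:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        /* One interleaved group of four channel values, e.g. R,G,B,A. */
        uint8_t group[4] = {10, 20, 30, 40};

        /* Four per-channel vectors, initially zero. */
        uint8x8x4_t planes;
        planes.val[0] = vdup_n_u8(0);
        planes.val[1] = vdup_n_u8(0);
        planes.val[2] = vdup_n_u8(0);
        planes.val[3] = vdup_n_u8(0);

        /* Scatter group[0..3] into lane 3 of planes.val[0..3]. */
        planes = vld4_lane_u8(group, planes, 3);

        /* Expected output: 10 20 30 40 */
        printf("%d %d %d %d\n",
               vget_lane_u8(planes.val[0], 3), vget_lane_u8(planes.val[1], 3),
               vget_lane_u8(planes.val[2], 3), vget_lane_u8(planes.val[3], 3));
        return 0;
    }

On the SSE path this expands to the four scalar vld1_lane_u8 insertions defined above.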
-+//******************* Store duplets *********************************************
-+//********************************************************************************
-+//here we assume the ptr is 16-byte aligned. If it is not, use _mm_storeu_si128 as shown in the vst1q_u8 function
-+//and, if necessary, modify all the store functions accordingly. See also the comments on the "Store single" functions.
-+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
-+{
-+ uint8x16x2_t v;
-+ v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
-+ vst1q_u8 (ptr, v.val[0]);
-+ vst1q_u8 ((ptr + 16), v.val[1]);
-+}
-+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
-+
-+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
-+{
-+ uint16x8x2_t v;
-+ v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
-+ vst1q_u16 (ptr, v.val[0]);
-+ vst1q_u16 ((ptr + 8), v.val[1]);
-+}
-+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
-+
-+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
-+{
-+ uint32x4x2_t v;
-+ v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
-+ vst1q_u32 (ptr, v.val[0]);
-+ vst1q_u32 ((ptr + 4), v.val[1]);
-+}
-+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
-+
-+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
-+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
-+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
-+
-+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
-+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
-+
-+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
-+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
-+#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
-+
-+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
-+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
-+{
-+ float32x4x2_t v;
-+ v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
-+ v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
-+ vst1q_f32 (ptr, v.val[0]);
-+ vst1q_f32 ((ptr + 4), v.val[1]);
-+}
-+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
-+
-+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
-+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
-+#define vst2q_p8 vst2q_u8
-+
-+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
-+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
-+#define vst2q_p16 vst2q_u16
-+
-+//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u8 (ptr, v0);
-+}
-+#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
-+
-+//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u16 (ptr, v0);
-+}
-+#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
-+
-+//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
-+{
-+ __m128i v0;
-+ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ vst1q_u32 (ptr, v0);
-+}
-+#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
-+
-+
-+//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
-+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
-+_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+}
-+#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
-+
-+//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
-+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
-+
-+//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
-+
-+//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
-+
-+//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
-+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
-+
-+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-+_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
-+{
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[0].m64_f32[1];
-+ *(ptr + 3) = val->val[1].m64_f32[1];
-+}
-+#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
-+
-+//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-+#define vst2_p8 vst2_u8
-+
-+//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-+#define vst2_p16 vst2_u16
-+
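The vst2* stores above interleave two source vectors via the SSE unpack instructions before writing them out. A minimal sketch of the resulting layout, assuming this header; the data is illustrative and the destination is 16-byte aligned to satisfy the alignment assumption noted at the start of this block:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        uint16_t b[8] = {100, 101, 102, 103, 104, 105, 106, 107};
        uint16_t out[16] __attribute__((aligned(16)));  /* GCC alignment attribute */

        uint16x8x2_t pair;
        pair.val[0] = vld1q_u16(a);
        pair.val[1] = vld1q_u16(b);

        /* out becomes a0,b0, a1,b1, ..., a7,b7. */
        vst2q_u16(out, pair);

        for (int i = 0; i < 16; i++)
            printf("%u ", (unsigned)out[i]);
        printf("\n");
        return 0;
    }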
-+//******************** Triplets store *****************************************
-+//******************************************************************************
-+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
-+{
-+ uint8x16x3_t v;
-+ __m128i v0,v1,v2, cff, bldmask;
-+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
-+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
-+
-+ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
-+ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
-+ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
-+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
-+ cff = _mm_cmpeq_epi8(v0, v0); //all ff
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
-+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
-+ vst1q_u8(ptr, v.val[0]);
-+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
-+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
-+ vst1q_u8((ptr + 16), v.val[1]);
-+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
-+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
-+ vst1q_u8((ptr + 32), v.val[2]);
-+}
-+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
-+
-+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
-+{
-+ uint16x8x3_t v;
-+ __m128i v0,v1,v2, cff, bldmask;
-+ _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
-+ _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
-+ _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
-+
-+ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
-+ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
-+ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
-+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
-+ cff = _mm_cmpeq_epi16(v0, v0); //all ff
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
-+ v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
-+ vst1q_u16(ptr, v.val[0]);
-+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
-+ v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
-+ vst1q_u16((ptr + 8), v.val[1]);
-+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
-+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
-+ bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
-+ v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
-+ vst1q_u16((ptr + 16), v.val[2]);
-+}
-+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
-+
-+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
-+{
-+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
-+ uint32x4x3_t v;
-+ __m128i tmp0, tmp1,tmp2;
-+ tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
-+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
-+ tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
-+ v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
-+ v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
-+ v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
-+ tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
-+ v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
-+
-+ vst1q_u32(ptr, v.val[0]);
-+ vst1q_u32((ptr + 4), v.val[1]);
-+ vst1q_u32((ptr + 8), v.val[2]);
-+}
-+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
-+
-+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
-+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
-+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
-+
-+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
-+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
-+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
-+
-+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
-+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
-+#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
-+
-+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
-+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
-+{
-+ float32x4x3_t v;
-+ __m128 tmp0, tmp1,tmp2;
-+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
-+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
-+ tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
-+ v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
-+ v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
-+ v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
-+ tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
-+ v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
-+
-+ vst1q_f32( ptr, v.val[0]);
-+ vst1q_f32( (ptr + 4), v.val[1]);
-+ vst1q_f32( (ptr + 8), v.val[2]);
-+}
-+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
-+
-+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
-+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
-+#define vst3q_p8 vst3q_u8
-+
-+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
-+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
-+#define vst3q_p16 vst3q_u16
-+
-+//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
-+{
-+ __m128i tmp, sh0, sh1, val0, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
-+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
-+ _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
-+ _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
-+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
-+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
-+ val2 = _pM128i(val->val[2]);
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
-+ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
-+ vst1q_u8(ptr, val0); //store as 128 bit structure
-+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
-+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
-+ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
-+ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
-+}
-+#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
-+
-+//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
-+{
-+ __m128i tmp, val0, val1, val2;
-+ _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
-+ _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
-+ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
-+ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
-+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
-+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
-+ val2 = _pM128i(val->val[2]);
-+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
-+ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
-+ vst1q_u16(ptr, val0); //store as 128 bit structure
-+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
-+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
-+ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
-+ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
-+}
-+#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
-+
-+//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
-+{
-+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
-+ __m128i val0, val1;
-+ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
-+ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
-+ val1 = _mm_srli_si128(val0, 8); //4,5, x,x
-+ _M64((*(__m64_128*)(ptr + 4)), val1);
-+ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
-+ val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
-+ vst1q_u32(ptr, val0); //store as 128 bit structure
-+}
-+#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
-+
-+//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+ *(ptr + 2) = val->val[2].m64_u64[0];
-+}
-+#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
-+
-+//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0]
-+#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
-+
-+//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0]
-+#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
-+
-+//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
-+#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
-+
-+//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
-+#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
-+
-+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-+// IA32 SIMD doesn't work with 16-bit floats currently, so we need to promote to 32-bit floats and work with two 128-bit registers; see vld1q_f16 for an example
-+
-+//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
-+_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
-+{
-+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[2].m64_f32[0];
-+ *(ptr + 3) = val->val[0].m64_f32[1];
-+ *(ptr + 4) = val->val[1].m64_f32[1];
-+ *(ptr + 5) = val->val[2].m64_f32[1];
-+}
-+#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
-+
-+//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
-+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
-+#define vst3_p8 vst3_u8
-+
-+//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
-+#define vst3_p16 vst3_s16
-+
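A sketch of the three-way interleave performed by the vst3* stores, again assuming this header and using illustrative data; three planar vectors are written out as r0,g0,b0, r1,g1,b1, ...:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t r[4] = {1, 2, 3, 4};
        uint16_t g[4] = {10, 20, 30, 40};
        uint16_t b[4] = {100, 200, 300, 400};
        uint16_t out[12] __attribute__((aligned(16)));

        uint16x4x3_t rgb;
        rgb.val[0] = vld1_u16(r);
        rgb.val[1] = vld1_u16(g);
        rgb.val[2] = vld1_u16(b);

        /* out becomes 1,10,100, 2,20,200, 3,30,300, 4,40,400. */
        vst3_u16(out, rgb);

        for (int i = 0; i < 12; i++)
            printf("%u ", (unsigned)out[i]);
        printf("\n");
        return 0;
    }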
-+//*************** Quadruples store ********************************
-+//*********************************************************************
-+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
-+{
-+ __m128i tmp1, tmp2, res;
-+ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
-+ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
-+ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
-+ vst1q_u8(ptr, res);
-+ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
-+ vst1q_u8((ptr + 16), res);
-+ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
-+ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
-+ res = _mm_unpacklo_epi16(tmp1, tmp2); //
-+ vst1q_u8((ptr + 32), res);
-+ res = _mm_unpackhi_epi16(tmp1, tmp2); //
-+ vst1q_u8((ptr + 48), res);
-+}
-+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
-+
-+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
-+ v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
-+ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
-+ v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
-+ vst1q_u16(ptr, v.val[0]);
-+ vst1q_u16((ptr + 8), v.val[1]);
-+ vst1q_u16((ptr + 16),v.val[2]);
-+ vst1q_u16((ptr + 24), v.val[3]);
-+}
-+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
-+
-+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
-+{
-+ uint16x8x4_t v;
-+ __m128i tmp1, tmp2;
-+ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
-+ v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
-+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
-+ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
-+ v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
-+ v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
-+ vst1q_u32(ptr, v.val[0]);
-+ vst1q_u32((ptr + 4), v.val[1]);
-+ vst1q_u32((ptr + 8), v.val[2]);
-+ vst1q_u32((ptr + 12), v.val[3]);
-+}
-+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
-+
-+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
-+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
-+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
-+
-+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
-+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
-+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
-+
-+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
-+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
-+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
-+
-+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
-+// IA32 SIMD doesn't work with 16bit floats currently
-+
-+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
-+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
-+{
-+ __m128 tmp3, tmp2, tmp1, tmp0;
-+ float32x4x4_t v;
-+ tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
-+ tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
-+ tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
-+ tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
-+ v.val[0] = _mm_movelh_ps(tmp0, tmp2);
-+ v.val[1] = _mm_movehl_ps(tmp2, tmp0);
-+ v.val[2] = _mm_movelh_ps(tmp1, tmp3);
-+ v.val[3] = _mm_movehl_ps(tmp3, tmp1);
-+ vst1q_f32(ptr, v.val[0]);
-+ vst1q_f32((ptr + 4), v.val[1]);
-+ vst1q_f32((ptr + 8), v.val[2]);
-+ vst1q_f32((ptr + 12), v.val[3]);
-+}
-+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
-+
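The vst4q_f32 emulation above is an in-register 4x4 transpose followed by four consecutive stores. On plain SSE the same effect can be had (an equivalence sketch, not what the patch itself uses) with the standard _MM_TRANSPOSE4_PS macro from xmmintrin.h; the data here is illustrative:

    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        /* Rows of a 4x4 matrix: a, b, c, d. */
        __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
        __m128 b = _mm_setr_ps(10.f, 11.f, 12.f, 13.f);
        __m128 c = _mm_setr_ps(20.f, 21.f, 22.f, 23.f);
        __m128 d = _mm_setr_ps(30.f, 31.f, 32.f, 33.f);
        float out[16];

        /* Same unpacklo/unpackhi + movelh/movehl dance as vst4q_f32_ptr. */
        _MM_TRANSPOSE4_PS(a, b, c, d);
        _mm_storeu_ps(out + 0,  a);   /* a0,b0,c0,d0 */
        _mm_storeu_ps(out + 4,  b);   /* a1,b1,c1,d1 */
        _mm_storeu_ps(out + 8,  c);   /* a2,b2,c2,d2 */
        _mm_storeu_ps(out + 12, d);   /* a3,b3,c3,d3 */

        for (int i = 0; i < 16; i++)
            printf("%g ", out[i]);
        printf("\n");
        return 0;
    }

Either way the columns of the source matrix end up contiguous in memory, which is the a0,b0,c0,d0, a1,b1,c1,d1, ... layout that NEON's VST4.32 produces.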
-+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
-+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
-+#define vst4q_p8 vst4q_u8
-+
-+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
-+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
-+#define vst4q_p16 vst4q_s16
-+
-+//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
-+{
-+ __m128i sh0, sh1, val0, val2;
-+ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
-+ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
-+ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
-+ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
-+ vst1q_u8(ptr, val0);
-+ vst1q_u8((ptr + 16), val2);
-+}
-+#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
-+
-+//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
-+{
-+ __m128i sh0, sh1, val0, val2;
-+ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
-+ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
-+ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
-+ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
-+ vst1q_u16(ptr, val0); //store as 128 bit structure
-+ vst1q_u16((ptr + 8), val2);
-+}
-+#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
-+
-+//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
-+{
-+ //0,4, 1,5, 2,6, 3,7
-+ __m128i sh0, sh1, val0, val1;
-+ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
-+ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
-+ val0 = _mm_unpacklo_epi64(sh0,sh1); //
-+ val1 = _mm_unpackhi_epi64(sh0,sh1); //
-+ vst1q_u32(ptr, val0); //store as 128 bit structure
-+ vst1q_u32((ptr + 4), val1);
-+}
-+#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
-+
-+//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
-+{
-+ *(ptr) = val->val[0].m64_u64[0];
-+ *(ptr + 1) = val->val[1].m64_u64[0];
-+ *(ptr + 2) = val->val[2].m64_u64[0];
-+ *(ptr + 3) = val->val[3].m64_u64[0];
-+}
-+#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
-+
-+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
-+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
-+
-+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
-+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
-+
-+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
-+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
-+
-+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
-+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
-+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
-+
-+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
-+// IA32 SIMD doesn't work with 16-bit floats currently, so we need to promote to 32-bit floats and work with two 128-bit registers; see vld1q_f16 for an example
-+
-+//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
-+_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
-+{
-+ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
-+ *(ptr) = val->val[0].m64_f32[0];
-+ *(ptr + 1) = val->val[1].m64_f32[0];
-+ *(ptr + 2) = val->val[2].m64_f32[0];
-+ *(ptr + 3) = val->val[3].m64_f32[0];
-+ *(ptr + 4) = val->val[0].m64_f32[1];
-+ *(ptr + 5) = val->val[1].m64_f32[1];
-+ *(ptr + 6) = val->val[2].m64_f32[1];
-+ *(ptr + 7) = val->val[3].m64_f32[1];
-+}
-+#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
-+
-+//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
-+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
-+#define vst4_p8 vst4_u8
-+
-+//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
-+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
-+#define vst4_p16 vst4_u16
-+
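The 64-bit vst4_* forms above widen their operands to XMM registers, interleave them with two unpack stages and store 32 bytes (the u64 and f32 forms fall back to plain scalar copies). A sketch of the typical planar-to-packed use, assuming this header and illustrative values:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        /* Four 8-lane planes, e.g. R, G, B, A after planar processing. */
        uint8x8x4_t planes;
        planes.val[0] = vdup_n_u8(0x11);
        planes.val[1] = vdup_n_u8(0x22);
        planes.val[2] = vdup_n_u8(0x33);
        planes.val[3] = vdup_n_u8(0x44);

        uint8_t packed[32] __attribute__((aligned(16)));

        /* packed becomes 11,22,33,44, 11,22,33,44, ... (8 groups). */
        vst4_u8(packed, planes);

        for (int i = 0; i < 8; i++)
            printf("%02x ", (unsigned)packed[i]);
        printf("\n");
        return 0;
    }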
-+//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
-+//********************************************************************************************************************
-+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
-+{
-+ vst1q_lane_s16(ptr, val->val[0], lane);
-+ vst1q_lane_s16((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_u32(ptr, val->val[0], lane);
-+ vst1q_lane_u32((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
-+#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+}
-+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
-+
-+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
-+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2q_lane_p16 vst2q_lane_s16
-+
-+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+}
-+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+}
-+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+}
-+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
-+#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
-+
-+//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
-+#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
-+#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
-+//current IA SIMD doesn't support float16
-+
-+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+}
-+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
-+
-+//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
-+#define vst2_lane_p8 vst2_lane_u8
-+
-+//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-+#define vst2_lane_p16 vst2_lane_u16
-+
-+//************************* Triple lanes stores *******************************************************
-+//*******************************************************************************************************
-+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
-+{
-+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
-+ vst1q_lane_u16((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
-+ vst1q_lane_u32((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
-+
-+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
-+
-+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+ vst1q_lane_f32((ptr + 2), val->val[2], lane);
-+}
-+#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
-+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3q_lane_p16 vst3q_lane_s16
-+
-+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+ *(ptr + 2) = val->val[2].m64_u8[lane];
-+}
-+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+ *(ptr + 2) = val->val[2].m64_u16[lane];
-+}
-+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+ *(ptr + 2) = val->val[2].m64_u32[lane];
-+}
-+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
-+
-+//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
-+
-+//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
-+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
-+
-+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
-+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+ *(ptr + 2) = val->val[2].m64_f32[lane];
-+}
-+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
-+#define vst3_lane_p8 vst3_lane_u8
-+
-+//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
-+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
-+#define vst3_lane_p16 vst3_lane_s16
-+
-+//******************************** Quadruple lanes stores ***********************************************
-+//*******************************************************************************************************
-+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
-+{
-+ vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
-+ vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
-+}
-+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
-+{
-+ vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
-+ vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
-+}
-+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
-+
-+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
-+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
-+
-+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
-+//current IA SIMD doesn't support float16
-+
-+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
-+{
-+ vst1q_lane_f32(ptr, val->val[0], lane);
-+ vst1q_lane_f32((ptr + 1), val->val[1], lane);
-+ vst1q_lane_f32((ptr + 2), val->val[2], lane);
-+ vst1q_lane_f32((ptr + 3), val->val[3], lane);
-+}
-+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4q_lane_p16 vst4q_lane_u16
-+
-+//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u8[lane];
-+ *(ptr + 1) = val->val[1].m64_u8[lane];
-+ *(ptr + 2) = val->val[2].m64_u8[lane];
-+ *(ptr + 3) = val->val[3].m64_u8[lane];
-+}
-+#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u16[lane];
-+ *(ptr + 1) = val->val[1].m64_u16[lane];
-+ *(ptr + 2) = val->val[2].m64_u16[lane];
-+ *(ptr + 3) = val->val[3].m64_u16[lane];
-+}
-+#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_u32[lane];
-+ *(ptr + 1) = val->val[1].m64_u32[lane];
-+ *(ptr + 2) = val->val[2].m64_u32[lane];
-+ *(ptr + 3) = val->val[3].m64_u32[lane];
-+}
-+#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
-+
-+//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
-+
-+//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
-+
-+//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
-+
-+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
-+//current IA SIMD doesn't support float16
-+
-+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
-+{
-+ *(ptr) = val->val[0].m64_f32[lane];
-+ *(ptr + 1) = val->val[1].m64_f32[lane];
-+ *(ptr + 2) = val->val[2].m64_f32[lane];
-+ *(ptr + 3) = val->val[3].m64_f32[lane];
-+}
-+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
-+
-+//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
-+#define vst4_lane_p8 vst4_lane_u8
-+
-+//void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
-+#define vst4_lane_p16 vst4_lane_u16
-+
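All the *_lane stores above write the selected element of each source vector to consecutive memory locations; the 64-bit forms reduce to plain scalar assignments, and the triple and quadruple variants simply extend the pair pattern. A minimal sketch, assuming this header with illustrative data:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t a[4] = {0, 1, 2, 3};
        uint16_t b[4] = {10, 11, 12, 13};
        uint16_t c[4] = {20, 21, 22, 23};
        uint16_t d[4] = {30, 31, 32, 33};
        uint16_t out[4];

        uint16x4x4_t v;
        v.val[0] = vld1_u16(a);
        v.val[1] = vld1_u16(b);
        v.val[2] = vld1_u16(c);
        v.val[3] = vld1_u16(d);

        /* Store element 2 of each vector: out = 2, 12, 22, 32. */
        vst4_lane_u16(out, v, 2);

        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        return 0;
    }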
-+//**************************************************************************************************
-+//************************ Extract lanes from a vector ********************************************
-+//**************************************************************************************************
-+//These intrinsics extract a single lane (element) from a vector.
-+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
-+
-+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
-+#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
-+
-+
-+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
-+
-+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
-+#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
-+
-+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
-+#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
-+
-+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
-+
-+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-+#define vget_lane_p8 vget_lane_u8
-+
-+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
-+#define vget_lane_p16 vget_lane_u16
-+
-+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-+#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
-+
-+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+#define vgetq_lane_u8 _MM_EXTRACT_EPI8
-+
-+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
-+#define vgetq_lane_u16 _MM_EXTRACT_EPI16
-+
-+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+#define vgetq_lane_u32 _MM_EXTRACT_EPI32
-+
-+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-+#define vgetq_lane_s8 vgetq_lane_u8
-+
-+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-+#define vgetq_lane_s16 vgetq_lane_u16
-+
-+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+#define vgetq_lane_s32 vgetq_lane_u32
-+
-+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-+#define vgetq_lane_p8 vgetq_lane_u8
-+
-+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
-+#define vgetq_lane_p16 vgetq_lane_u16
-+
-+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ int32_t ilane;
-+ ilane = _MM_EXTRACT_PS(vec,lane);
-+ return *(float*)&ilane;
-+}
-+
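vgetq_lane_f32 above reinterprets the integer bit pattern returned by _MM_EXTRACT_PS through a pointer cast, which formally breaks C strict-aliasing rules even though it works with the compilers this patch targets. A strict-aliasing-safe alternative (a sketch, not what the patch uses; bits_to_f32 is a hypothetical helper name) goes through memcpy, which compilers fold into the same single move:

    #include <string.h>   /* memcpy */
    #include <stdint.h>

    /* Hypothetical helper: bit-cast an extracted 32-bit lane to float. */
    static inline float bits_to_f32(int32_t bits)
    {
        float f;
        memcpy(&f, &bits, sizeof f);   /* well-defined type pun */
        return f;
    }
    /* e.g.: return bits_to_f32(_MM_EXTRACT_PS(vec, lane)); */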
-+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+#define vget_lane_s64(vec, lane) vec.m64_i64[0]
-+
-+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-+#define vget_lane_u64(vec, lane) vec.m64_u64[0]
-+
-+
-+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+#define vgetq_lane_s64 (int64_t) vgetq_lane_u64
-+
-+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-+#define vgetq_lane_u64 _MM_EXTRACT_EPI64
-+
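The lane get/set intrinsics in this block and the one below come down to element reads and single-element loads. A small round-trip sketch, assuming this header:

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        float32x2_t v = vdup_n_f32(1.5f);

        /* Overwrite lane 1, then read both lanes back: prints 1.5 4.25. */
        v = vset_lane_f32(4.25f, v, 1);
        printf("%g %g\n", (double)vget_lane_f32(v, 0), (double)vget_lane_f32(v, 1));
        return 0;
    }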
-+// ***************** Set lanes within a vector ********************************************
-+// **************************************************************************************
-+//These intrinsics set a single lane (element) within a vector.
-+//same functions as vld1_lane_xx ones, but take the value to be set directly.
-+
-+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8_t val;
-+ val = value;
-+ return vld1_lane_u8(&val, vec, lane);
-+}
-+
-+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16_t val;
-+ val = value;
-+ return vld1_lane_u16(&val, vec, lane);
-+}
-+
-+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32_t val;
-+ val = value;
-+ return vld1_lane_u32(&val, vec, lane);
-+}
-+
-+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
-+{
-+ int8_t val;
-+ val = value;
-+ return vld1_lane_s8(&val, vec, lane);
-+}
-+
-+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
-+{
-+ int16_t val;
-+ val = value;
-+ return vld1_lane_s16(&val, vec, lane);
-+}
-+
-+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
-+{
-+ int32_t val;
-+ val = value;
-+ return vld1_lane_s32(&val, vec, lane);
-+}
-+
-+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-+#define vset_lane_p8 vset_lane_u8
-+
-+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-+#define vset_lane_p16 vset_lane_u16
-+
-+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32_t val;
-+ val = value;
-+ return vld1_lane_f32(&val, vec, lane);
-+}
-+
-+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
-+{
-+ uint8_t val;
-+ val = value;
-+ return vld1q_lane_u8(&val, vec, lane);
-+}
-+
-+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint16_t val;
-+ val = value;
-+ return vld1q_lane_u16(&val, vec, lane);
-+}
-+
-+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint32_t val;
-+ val = value;
-+ return vld1q_lane_u32(&val, vec, lane);
-+}
-+
-+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
-+{
-+ int8_t val;
-+ val = value;
-+ return vld1q_lane_s8(&val, vec, lane);
-+}
-+
-+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
-+{
-+ int16_t val;
-+ val = value;
-+ return vld1q_lane_s16(&val, vec, lane);
-+}
-+
-+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
-+{
-+ int32_t val;
-+ val = value;
-+ return vld1q_lane_s32(&val, vec, lane);
-+}
-+
-+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-+#define vsetq_lane_p8 vsetq_lane_u8
-+
-+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-+#define vsetq_lane_p16 vsetq_lane_u16
-+
-+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
-+{
-+ float32_t val;
-+ val = value;
-+ return vld1q_lane_f32(&val, vec, lane);
-+}
-+
-+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
-+{
-+ int64_t val;
-+ val = value;
-+ return vld1_lane_s64(&val, vec, lane);
-+}
-+
-+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
-+{
-+ uint64_t val;
-+ val = value;
-+ return vld1_lane_u64(&val, vec, lane);
-+}
-+
-+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
-+{
-+    int64_t val;
-+ val = value;
-+ return vld1q_lane_s64(&val, vec, lane);
-+}
-+
-+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-+#define vsetq_lane_u64 vsetq_lane_s64
-+
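-+//Illustrative usage sketch for the get/set lane intrinsics above (variable
-+//names and values are examples only, not part of this header); the lane index
-+//must be a compile-time constant within the stated __constrange:
-+// float32x4_t v = vdupq_n_f32(1.0f);      // {1.0, 1.0, 1.0, 1.0}
-+// v = vsetq_lane_f32(2.5f, v, 3);         // lane 3 becomes 2.5, other lanes unchanged
-+// float32_t x = vgetq_lane_f32(v, 3);     // x == 2.5f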
-+// *******************************************************************************
-+// **************** Initialize a vector from bit pattern ***************************
-+// *******************************************************************************
-+//These intrinsics create a vector from a literal bit pattern.
-+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s8(a) (*(__m64_128*)&(a))
-+
-+
-+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s16 vcreate_s8
-+
-+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s32 vcreate_s8
-+
-+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
-+//no IA32 SIMD available
-+
-+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_f32(a) (*(__m64_128*)&(a))
-+
-+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u8 vcreate_s8
-+
-+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u16 vcreate_s16
-+
-+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u32 vcreate_s32
-+
-+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_u64 vcreate_s8
-+
-+
-+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_p8 vcreate_u8
-+
-+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_p16 vcreate_u16
-+
-+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
-+#define vcreate_s64 vcreate_u64
-+
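-+//Illustrative usage sketch for vcreate_xx (example value only): on a
-+//little-endian target the low-order bits of the 64-bit literal map to lane 0:
-+// uint8x8_t v = vcreate_u8(0x0706050403020100ULL);  // lanes {0,1,2,3,4,5,6,7}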
-+//********************* Set all lanes to same value ********************************
-+//*********************************************************************************
-+//These intrinsics set all lanes to the same value.
-+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint8x8_t res;
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ res.m64_u8[i] = value;
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint16x4_t res;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ res.m64_u16[i] = value;
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = value;
-+ res.m64_u32[1] = value;
-+ return res;
-+}
-+
-+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int8x8_t res;
-+ int i;
-+ for (i = 0; i<8; i++) {
-+ res.m64_i8[i] = value;
-+ }
-+ return res;
-+}
-+
-+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int i;
-+ for (i = 0; i<4; i++) {
-+ res.m64_i16[i] = value;
-+ }
-+ return res;
-+}
-+
-+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t res;
-+ res.m64_i32[0] = value;
-+ res.m64_i32[1] = value;
-+ return res;
-+}
-+
-+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
-+#define vdup_n_p8 vdup_n_u8
-+
-+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
-+#define vdup_n_p16 vdup_n_s16
-+
-+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
-+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = value;
-+ res.m64_f32[1] = value;
-+ return res;
-+}
-+
-+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
-+
-+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
-+
-+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
-+
-+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_s8 _mm_set1_epi8
-+
-+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_s16 _mm_set1_epi16
-+
-+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_s32 _mm_set1_epi32
-+
-+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+#define vdupq_n_p8 vdupq_n_u8
-+
-+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+#define vdupq_n_p16 vdupq_n_u16
-+
-+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-+#define vdupq_n_f32 _mm_set1_ps
-+
-+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = value;
-+ return res;
-+}
-+
-+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
-+{
-+ uint64x1_t res;
-+ res.m64_u64[0] = value;
-+ return res;
-+}
-+
-+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
-+{
-+ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
-+ return LOAD_SI128(value2);
-+}
-+
-+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
-+{
-+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
-+ return LOAD_SI128(val);
-+}
-+
-+//**** Set all lanes to same value ************************
-+//Same functions as above - just aliases.********************
-+//They probably reflect the fact that the 128-bit function versions use the VMOV instruction **********
-+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
-+#define vmov_n_u8 vdup_n_s8
-+
-+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
-+#define vmov_n_u16 vdup_n_s16
-+
-+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
-+#define vmov_n_u32 vdup_n_u32
-+
-+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
-+#define vmov_n_s8 vdup_n_s8
-+
-+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
-+#define vmov_n_s16 vdup_n_s16
-+
-+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
-+#define vmov_n_s32 vdup_n_s32
-+
-+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
-+#define vmov_n_p8 vdup_n_u8
-+
-+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
-+#define vmov_n_p16 vdup_n_s16
-+
-+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
-+#define vmov_n_f32 vdup_n_f32
-+
-+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_u8 vdupq_n_u8
-+
-+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_u16 vdupq_n_s16
-+
-+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_u32 vdupq_n_u32
-+
-+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_s8 vdupq_n_s8
-+
-+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_s16 vdupq_n_s16
-+
-+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_s32 vdupq_n_s32
-+
-+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-+#define vmovq_n_p8 vdupq_n_u8
-+
-+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-+#define vmovq_n_p16 vdupq_n_s16
-+
-+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-+#define vmovq_n_f32 vdupq_n_f32
-+
-+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
-+#define vmov_n_s64 vdup_n_s64
-+
-+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
-+#define vmov_n_u64 vdup_n_u64
-+
-+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-+#define vmovq_n_s64 vdupq_n_s64
-+
-+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
-+#define vmovq_n_u64 vdupq_n_u64
-+
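-+//Illustrative usage sketch for the vdup_n/vmov_n family (example values only):
-+// int16x8_t q = vdupq_n_s16(7);   // all eight lanes set to 7
-+// int16x4_t d = vmov_n_s16(7);    // 64-bit variant, an alias of vdup_n_s16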
-+//**************Set all lanes to the value of one lane of a vector *************
-+//****************************************************************************
-+//here a shuffle is a better solution than lane extraction followed by a set1 function
-+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
-+{
-+ uint8x8_t res;
-+ uint8_t valane;
-+ int i = 0;
-+ valane = vec.m64_u8[lane];
-+ for (i = 0; i<8; i++) {
-+ res.m64_u8[i] = valane;
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
-+{
-+ uint16x4_t res;
-+ uint16_t valane;
-+ valane = vec.m64_u16[lane];
-+ res.m64_u16[0] = valane;
-+ res.m64_u16[1] = valane;
-+ res.m64_u16[2] = valane;
-+ res.m64_u16[3] = valane;
-+ return res;
-+}
-+
-+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
-+{
-+ uint32x2_t res;
-+ res.m64_u32[0] = vec.m64_u32[lane];
-+ res.m64_u32[1] = res.m64_u32[0];
-+ return res;
-+}
-+
-+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+#define vdup_lane_s8 vdup_lane_u8
-+
-+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+#define vdup_lane_s16 vdup_lane_u16
-+
-+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+#define vdup_lane_s32 vdup_lane_u32
-+
-+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-+#define vdup_lane_p8 vdup_lane_u8
-+
-+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-+#define vdup_lane_p16 vdup_lane_s16
-+
-+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = vec.m64_f32[lane];
-+ res.m64_f32[1] = res.m64_f32[0];
-+ return res;
-+}
-+
-+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
-+{
-+ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
-+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
-+}
-+
-+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
-+{
-+ //we could use 8bit shuffle for 16 bit as well
-+ const int8_t lane16 = ((int8_t) lane) << 1;
-+ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
-+ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
-+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
-+}
-+
-+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6))
-+
-+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+#define vdupq_lane_s8 vdupq_lane_u8
-+
-+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+#define vdupq_lane_s16 vdupq_lane_u16
-+
-+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_s32 vdupq_lane_u32
-+
-+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-+#define vdupq_lane_p8 vdupq_lane_u8
-+
-+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-+#define vdupq_lane_p16 vdupq_lane_s16
-+
-+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-+#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
-+
-+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+#define vdup_lane_s64(vec,lane) vec
-+
-+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-+#define vdup_lane_u64(vec,lane) vec
-+
-+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
-+{
-+ __m128i vec128;
-+ vec128 = _pM128i(vec);
-+ return _mm_unpacklo_epi64(vec128,vec128);
-+}
-+
-+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-+#define vdupq_lane_u64 vdupq_lane_s64
-+
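-+//Illustrative usage sketch for vdup_lane/vdupq_lane (example values only):
-+// uint32x2_t v = vcreate_u32(0x0000000200000001ULL); // lanes {1, 2}
-+// uint32x2_t d = vdup_lane_u32(v, 1);                // {2, 2}
-+// uint32x4_t q = vdupq_lane_u32(v, 1);               // {2, 2, 2, 2}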
-+// ********************************************************************
-+// ******************** Combining vectors *****************************
-+// ********************************************************************
-+//These intrinsics join two 64-bit vectors into a single 128-bit vector.
-+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
-+#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
-+#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
-+#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
-+#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
-+
-+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
-+//current IA SIMD doesn't support float16
-+
-+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
-+{
-+ __m128i res;
-+ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
-+ return _M128(res);
-+}
-+
-+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
-+#define vcombine_u8 vcombine_s8
-+
-+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
-+#define vcombine_u16 vcombine_s16
-+
-+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
-+#define vcombine_u32 vcombine_s32
-+
-+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
-+#define vcombine_u64 vcombine_s64
-+
-+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
-+#define vcombine_p8 vcombine_u8
-+
-+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
-+#define vcombine_p16 vcombine_u16
-+
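-+//Illustrative usage sketch for vcombine_xx (example values only): 'low'
-+//supplies the lower-numbered lanes, 'high' the upper ones:
-+// float32x2_t lo = vdup_n_f32(1.0f), hi = vdup_n_f32(2.0f);
-+// float32x4_t q = vcombine_f32(lo, hi);  // {1.0, 1.0, 2.0, 2.0}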
-+//**********************************************************************
-+//************************* Splitting vectors **************************
-+//**********************************************************************
-+//**************** Get high part ******************************************
-+//These intrinsics split a 128-bit vector into 2 component 64-bit vectors
-+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
-+{
-+ int64x1_t res64;
-+ __m128i res;
-+ res = _mm_unpackhi_epi64(a,a); //SSE2
-+ return64(res);
-+}
-+
-+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
-+// IA32 SIMD doesn't work with 16-bit floats currently
-+
-+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
-+{
-+ __m128i res;
-+ __m64_128 res64;
-+ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
-+ return64(res);
-+}
-+
-+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
-+#define vget_high_u8 vget_high_s8
-+
-+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
-+#define vget_high_u16 vget_high_s16
-+
-+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
-+#define vget_high_u32 vget_high_s32
-+
-+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
-+#define vget_high_u64 vget_high_s64
-+
-+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
-+#define vget_high_p8 vget_high_u8
-+
-+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
-+#define vget_high_p16 vget_high_u16
-+
-+//********************** Get low part **********************
-+//**********************************************************
-+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
-+{
-+ int16x4_t res64;
-+ return64(a);
-+}
-+
-+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
-+{
-+ int16x4_t res64;
-+ return64(a);
-+}
-+
-+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
-+{
-+ int32x2_t res64;
-+ return64(a);
-+}
-+
-+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
-+{
-+ int64x1_t res64;
-+ return64 (a);
-+}
-+
-+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
-+// IA32 SIMD doesn't work with 16-bit floats currently
-+
-+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
-+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
-+{
-+ float32x2_t res64;
-+ _M64f(res64, a);
-+ return res64;
-+}
-+
-+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
-+#define vget_low_u8 vget_low_s8
-+
-+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
-+#define vget_low_u16 vget_low_s16
-+
-+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
-+#define vget_low_u32 vget_low_s32
-+
-+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
-+#define vget_low_u64 vget_low_s64
-+
-+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
-+#define vget_low_p8 vget_low_u8
-+
-+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
-+#define vget_low_p16 vget_low_s16
-+
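-+//Illustrative usage sketch (example values only): vget_low/vget_high are the
-+//inverse of vcombine:
-+// int32x4_t q = vdupq_n_s32(5);
-+// int32x2_t lo = vget_low_s32(q);   // lanes 0..1
-+// int32x2_t hi = vget_high_s32(q);  // lanes 2..3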
-+//**************************************************************************
-+//************************ Converting vectors **********************************
-+//**************************************************************************
-+//************* Convert from float ***************************************
-+// the rounding mode needs to be set accordingly with _MM_SET_ROUNDING_MODE(x)
-+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
-+_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
-+ return64(res);
-+}
-+
-+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
-+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
-+{
-+    //may not be efficient compared with a serial SIMD solution
-+ uint32x2_t res64;
-+ __m128i res;
-+ res = vcvtq_u32_f32(_pM128(a));
-+ return64(res);
-+}
-+
-+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-+#define vcvtq_s32_f32 _mm_cvttps_epi32
-+
-+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
-+{
-+    //No single-instruction SSE solution, but we could implement it as follows:
-+ __m128i resi;
-+ __m128 zero, mask, a_pos, mask_f_max_si, res;
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ zero = _mm_setzero_ps();
-+ mask = _mm_cmpgt_ps(a, zero);
-+ a_pos = _mm_and_ps(a, mask);
-+ mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
-+ res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything
-+ resi = _mm_cvttps_epi32(res);
-+ return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
-+}
-+
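-+//Illustrative usage sketch (example value only): as with the _mm_cvttps_epi32
-+//based implementation above, the conversion truncates toward zero:
-+// float32x4_t f = vdupq_n_f32(-1.7f);
-+// int32x4_t i = vcvtq_s32_f32(f);   // every lane becomes -1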
-+// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
-+//*************************************************************************************************
-+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
-+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
-+{
-+ int32x2_t res64;
-+ return64(vcvtq_n_s32_f32(_pM128(a),b));
-+}
-+
-+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
-+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
-+{
-+ uint32x2_t res;
-+ float convconst;
-+ convconst = (float)((uint32_t)1 << b);
-+ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
-+ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
-+ return res;
-+}
-+
-+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ __m128 cconst128;
-+ __m128i mask, res;
-+ convconst = (float)(1 << b);
-+ cconst128 = vdupq_n_f32(convconst);
-+ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
-+ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
-+ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
-+}
-+
-+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
-+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128;
-+ convconst = (float)(1 << b);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
-+}
-+
-+//***************** Convert to float *************************
-+//*************************************************************
-+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
-+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (float) a.m64_i32[0];
-+ res.m64_f32[1] = (float) a.m64_i32[1];
-+ return res;
-+}
-+
-+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
-+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = (float) a.m64_u32[0];
-+ res.m64_f32[1] = (float) a.m64_u32[1];
-+ return res;
-+}
-+
-+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
-+
-+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
-+{
-+    //solution may not be optimal
-+ __m128 two16, fHi, fLo;
-+ __m128i hi, lo;
-+ two16 = _mm_set1_ps((float)0x10000); //2^16
-+ // Avoid double rounding by doing two exact conversions
-+ // of high and low 16-bit segments
-+ hi = _mm_srli_epi32(a, 16);
-+ lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
-+ fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
-+ fLo = _mm_cvtepi32_ps(lo);
-+ // do single rounding according to current rounding mode
-+ return _mm_add_ps(fHi, fLo);
-+}
-+
-+// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
-+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
-+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
-+{
-+ float32x2_t res;
-+ float convconst;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ res.m64_f32[0] = a.m64_i32[0] * convconst;
-+ res.m64_f32[1] = a.m64_i32[1] * convconst;
-+ return res;
-+}
-+
-+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
-+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
-+{
-+ float32x2_t res;
-+ float convconst;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ res.m64_f32[0] = a.m64_u32[0] * convconst;
-+ res.m64_f32[1] = a.m64_u32[1] * convconst;
-+ return res;
-+}
-+
-+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128, af;
-+ convconst = (float)(1. / ((uint32_t)1 << b));
-+ af = _mm_cvtepi32_ps(a);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return _mm_mul_ps(af,cconst128);
-+}
-+
-+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
-+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
-+{
-+ float convconst;
-+ __m128 cconst128, af;
-+ convconst = (float)(1. / (1 << b));
-+ af = vcvtq_f32_u32(a);
-+ cconst128 = vdupq_n_f32(convconst);
-+ return _mm_mul_ps(af,cconst128);
-+}
-+
-+//**************Convert between floats ***********************
-+//************************************************************
-+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
-+//Intel SIMD doesn't support 16-bit floats currently
-+
-+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
-+//Intel SIMD doesn't support 16-bit floats currently; the only solution is to store 16-bit floats and load them as 32 bits
-+
-+//************Vector narrow integer conversion (truncation) ******************
-+//****************************************************************************
-+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
-+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
-+ return64(res);
-+}
-+
-+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
-+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
-+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
-+ return64(res);
-+}
-+
-+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
-+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
-+{
-+    //may not be efficient compared with a serial implementation
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
-+ return64(res);
-+}
-+
-+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
-+#define vmovn_u16 vmovn_s16
-+
-+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
-+#define vmovn_u32 vmovn_s32
-+
-+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
-+#define vmovn_u64 vmovn_s64
-+
-+//**************** Vector long move ***********************
-+//***********************************************************
-+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
-+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
-+
-+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
-+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
-+
-+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
-+#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
-+
-+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
-+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
-+
-+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
-+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
-+
-+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
-+#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
-+
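-+//Illustrative usage sketch (example values only): vmovn_xx keep the low half of
-+//each element without saturation, while vmovl_xx widen with sign/zero extension:
-+// int16x8_t w = vdupq_n_s16(0x1234);
-+// int8x8_t  n = vmovn_s16(w);     // each lane is 0x34 (high byte dropped)
-+// int16x8_t b = vmovl_s8(n);      // sign-extended back to 0x0034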
-+//*************Vector saturating narrow integer*****************
-+//**************************************************************
-+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
-+_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_packs_epi16(a, a);
-+ return64(res);
-+}
-+
-+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
-+_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_packs_epi32(a, a);
-+ return64(res);
-+}
-+
-+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
-+{
-+ int32x2_t res;
-+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
-+ _mm_store_si128((__m128i*)atmp, a);
-+ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
-+ if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
-+ if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
-+ if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
-+ res.m64_i32[0] = (int32_t)atmp[0];
-+ res.m64_i32[1] = (int32_t)atmp[1];
-+ return res;
-+}
-+
-+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
-+_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
-+{
-+    //no uint16 to uint8 conversion in SSE, need to truncate to max signed first
-+ uint8x8_t res64;
-+ __m128i c7fff, a_trunc;
-+ c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
-+ a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
-+ a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
-+ return64(a_trunc);
-+}
-+
-+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
-+_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
-+{
-+    //no uint32 to uint16 conversion in SSE, need to truncate to max signed first
-+ uint16x4_t res64;
-+ __m128i c7fffffff, a_trunc;
-+ c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
-+ a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
-+ a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
-+ return64(a_trunc);
-+}
-+
-+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
-+_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
-+{
-+ //serial solution may be faster
-+ uint32x2_t res64;
-+ __m128i res_hi, mask;
-+ mask = _mm_setzero_si128();
-+ res_hi = _mm_srli_epi64(a, 32);
-+ res_hi = _mm_cmpeq_epi32(res_hi, mask);
-+ mask = _mm_cmpeq_epi32(mask,mask); //all fff
-+    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to mark the numbers that do not fit into 32 bits
-+ res_hi = _mm_or_si128(a, mask);
-+ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(res_hi);
-+}
-+//************* Vector saturating narrow integer signed->unsigned **************
-+//*****************************************************************************
-+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
-+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
-+{
-+ uint8x8_t res64;
-+ __m128i res;
-+ res = _mm_packus_epi16(a, a); //use low 64bits only
-+ return64(res);
-+}
-+
-+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
-+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
-+{
-+ uint16x4_t res64;
-+ __m128i res;
-+ res = _MM_PACKUS1_EPI32(a); //use low 64bits only
-+ return64(res);
-+}
-+
-+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
-+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
-+{
-+ uint32x2_t res64;
-+ __m128i res_hi,res_lo, zero, cmp;
-+ zero = _mm_setzero_si128();
-+ res_hi = _mm_srli_epi64(a, 32);
-+ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
-+ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
-+ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
-+    res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
-+ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-+ return64(res_lo);
-+}
-+
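-+//Illustrative usage sketch (example values only): unlike vmovn, the saturating
-+//variants clamp values that do not fit the narrower type:
-+// int16x8_t w = vdupq_n_s16(300);
-+// int8x8_t  s = vqmovn_s16(w);    // clamps to 127 in every lane
-+// uint8x8_t u = vqmovun_s16(w);   // signed-to-unsigned variant clamps to 255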
-+// ********************************************************
-+// **************** Table look up **************************
-+// ********************************************************
-+//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
-+//in a table and generate a new vector. Indexes out of range return 0.
-+//for Intel SIMD we need to set the MSB of out-of-range indexes to 1 to get a zero return
-+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ __m128i c7, maskgt, bmask, b128;
-+ c7 = _mm_set1_epi8 (7);
-+ b128 = _pM128i(b);
-+ maskgt = _mm_cmpgt_epi8(b128,c7);
-+ bmask = _mm_or_si128(b128,maskgt);
-+ bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
-+ return64(bmask);
-+}
-+
-+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
-+#define vtbl1_s8 vtbl1_u8
-+
-+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-+#define vtbl1_p8 vtbl1_u8
-+
-+//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
-+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
-+{
-+ uint8x8_t res64;
-+ __m128i c15, a01, maskgt15, bmask, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ b128 = _pM128i(b);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ bmask = _mm_or_si128(b128, maskgt15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
-+ a01 = _mm_shuffle_epi8(a01, bmask);
-+ return64(a01);
-+}
-+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
-+
-+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+#define vtbl2_s8 vtbl2_u8
-+
-+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
-+#define vtbl2_p8 vtbl2_u8
-+
-+//Special trick to avoid the "__declspec(align('16')) won't be aligned" error
-+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
-+{
-+    //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ c23 = _mm_set1_epi8 (23);
-+ b128 = _pM128i(b);
-+ maskgt23 = _mm_cmpgt_epi8(b128,c23);
-+ bmask = _mm_or_si128(b128, maskgt23);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
-+ sh0 = _mm_shuffle_epi8(a01, bmask);
-+ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
-+ return64(sh0);
-+}
-+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
-+
-+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+#define vtbl3_s8 vtbl3_u8
-+
-+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
-+#define vtbl3_p8 vtbl3_u8
-+
-+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
-+{
-+    //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
-+ c15 = _mm_set1_epi8 (15);
-+ c31 = _mm_set1_epi8 (31);
-+ b128 = _pM128i(b);
-+ maskgt31 = _mm_cmpgt_epi8(b128,c31);
-+ bmask = _mm_or_si128(b128, maskgt31);
-+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
-+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
-+ a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
-+ sh0 = _mm_shuffle_epi8(a01, bmask);
-+ sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
-+ return64(sh0);
-+}
-+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
-+
-+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+#define vtbl4_s8 vtbl4_u8
-+
-+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
-+#define vtbl4_p8 vtbl4_u8
-+
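-+//Illustrative usage sketch (example values only): every byte of the index
-+//vector selects a byte of the table, and out-of-range indexes produce 0:
-+// uint8x8_t tbl = vcreate_u8(0x0706050403020100ULL);
-+// uint8x8_t idx = vcreate_u8(0xFF00000000000003ULL);  // lane 0 = 3, lane 7 = 0xFF
-+// uint8x8_t r = vtbl1_u8(tbl, idx);  // lane 0 -> tbl[3] = 3, lane 7 -> 0 (out of range)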
-+//****************** Extended table look up intrinsics ***************************
-+//**********************************************************************************
-+//VTBX (Vector Table Extension) works in the same way as VTBL does,
-+// except that indexes out of range leave the destination element unchanged.
-+
-+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
-+{
-+ uint8x8_t res64;
-+ __m128i c7, maskgt, sh, c128;
-+ c7 = _mm_set1_epi8 (7);
-+ c128 = _pM128i(c);
-+ maskgt = _mm_cmpgt_epi8(c128,c7);
-+ c7 = _mm_and_si128(maskgt,_pM128i(a));
-+ sh = _mm_shuffle_epi8(_pM128i(b),c128);
-+ sh = _mm_andnot_si128(maskgt,sh);
-+ sh = _mm_or_si128(sh,c7);
-+ return64(sh);
-+}
-+
-+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
-+#define vtbx1_s8 vtbx1_u8
-+
-+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-+#define vtbx1_p8 vtbx1_u8
-+
-+//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
-+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
-+{
-+ uint8x8_t res64;
-+ __m128i c15, b01, maskgt15, sh, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128, c15);
-+ c15 = _mm_and_si128(maskgt15, _pM128i(a));
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
-+ sh = _mm_shuffle_epi8(b01, c128);
-+ sh = _mm_andnot_si128(maskgt15, sh);
-+ sh = _mm_or_si128(sh,c15);
-+ return64(sh);
-+}
-+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+#define vtbx2_s8 vtbx2_u8
-+
-+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-+#define vtbx2_p8 vtbx2_u8
-+
-+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
-+{
-+    //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c23 = _mm_set1_epi8 (23);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
-+ maskgt23 = _mm_cmpgt_epi8(c128,c23);
-+ c23 = _mm_and_si128(maskgt23, _pM128i(a));
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
-+ sh0 = _mm_shuffle_epi8(b01, c128);
-+ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
-+ sh0 = _mm_andnot_si128(maskgt23,sh0);
-+ sh0 = _mm_or_si128(sh0,c23);
-+ return64(sh0);
-+}
-+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
-+#define vtbx3_s8 vtbx3_u8
-+
-+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
-+#define vtbx3_p8 vtbx3_u8
-+
-+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
-+{
-+    //solution may not be optimal
-+ uint8x8_t res64;
-+ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
-+ c15 = _mm_set1_epi8 (15);
-+ c31 = _mm_set1_epi8 (31);
-+ c128 = _pM128i(c);
-+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
-+ maskgt31 = _mm_cmpgt_epi8(c128,c31);
-+ c31 = _mm_and_si128(maskgt31, _pM128i(a));
-+
-+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
-+ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
-+ sh0 = _mm_shuffle_epi8(b01, c128);
-+ sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15)
-+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
-+ sh0 = _mm_andnot_si128(maskgt31,sh0);
-+ sh0 = _mm_or_si128(sh0,c31);
-+ return64(sh0);
-+}
-+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
-+
-+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
-+#define vtbx4_s8 vtbx4_u8
-+
-+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
-+#define vtbx4_p8 vtbx4_u8
-+
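-+//Illustrative usage sketch (example values only): vtbx differs from vtbl in
-+//that out-of-range indexes keep the corresponding byte of the first argument:
-+// uint8x8_t tbl = vcreate_u8(0x0706050403020100ULL);
-+// uint8x8_t idx = vcreate_u8(0x0000000000000020ULL);  // lane 0 = 32, out of range
-+// uint8x8_t def = vdup_n_u8(0xAA);
-+// uint8x8_t r = vtbx1_u8(def, tbl, idx);  // lane 0 stays 0xAA, lanes 1..7 = tbl[0] (0)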
-+//*************************************************************************************************
-+// *************************** Operations with a scalar value *********************************
-+//*************************************************************************************************
-+
-+//******* Vector multiply accumulate by scalar *************************************************
-+//**********************************************************************************************
-+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
-+{
-+ int16_t c;
-+ int16x4_t scalar;
-+ c = vget_lane_s16(v, l);
-+ scalar = vdup_n_s16(c);
-+ return vmla_s16(a, b, scalar);
-+}
-+
-+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
-+{
-+ int32_t c;
-+ int32x2_t scalar;
-+ c = vget_lane_s32(v, l);
-+ scalar = vdup_n_s32(c);
-+ return vmla_s32(a, b, scalar);
-+}
-+
-+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
-+#define vmla_lane_u16 vmla_lane_s16
-+
-+
-+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
-+#define vmla_lane_u32 vmla_lane_s32
-+
-+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
-+{
-+ float32_t vlane;
-+ float32x2_t c;
-+ vlane = vget_lane_f32(v, l);
-+ c = vdup_n_f32(vlane);
-+ return vmla_f32(a,b,c);
-+}
-+
-+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x8_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdupq_n_s16(vlane);
-+ return vmlaq_s16(a,b,c);
-+}
-+
-+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x4_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdupq_n_s32(vlane);
-+ return vmlaq_s32(a,b,c);
-+}
-+
-+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+#define vmlaq_lane_u16 vmlaq_lane_s16
-+
-+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+#define vmlaq_lane_u32 vmlaq_lane_s32
-+
-+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32_t vlane;
-+ float32x4_t c;
-+ vlane = vget_lane_f32(v, l);
-+ c = vdupq_n_f32(vlane);
-+ return vmlaq_f32(a,b,c);
-+}
-+
-+//***************** Vector widening multiply accumulate by scalar **********************
-+//***************************************************************************************
-+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlal_s16(a, b, c);
-+}
-+
-+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmlal_s32(a, b, c);
-+}
-+
-+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_u16(v, l);
-+ c = vdup_n_u16(vlane);
-+ return vmlal_u16(a, b, c);
-+}
-+
-+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmlal_u32(a, b, c);
-+}
-+
-+// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
-+// ************************************************************************************************
-+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vqdmlal_s16(a, b, c);
-+}
-+
-+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
-+{
-+ int32_t vlane;
-+    int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vqdmlal_s32(a, b, c);
-+}
-+
-+// ****** Vector multiply subtract by scalar *****************
-+// *************************************************************
-+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmls_s16(a, b, c);
-+}
-+
-+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmls_s32(a, b, c);
-+}
-+
-+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmls_s16(a, b, c);
-+}
-+
-+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmls_u32(a, b, c);
-+}
-+
-+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
-+{
-+ float32_t vlane;
-+ float32x2_t c;
-+ vlane = (float) vget_lane_f32(v, l);
-+ c = vdup_n_f32(vlane);
-+ return vmls_f32(a,b,c);
-+}
-+
-+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x8_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdupq_n_s16(vlane);
-+ return vmlsq_s16(a, b,c);
-+}
-+
-+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x4_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdupq_n_s32(vlane);
-+ return vmlsq_s32(a,b,c);
-+}
-+
-+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x8_t c;
-+ vlane = vget_lane_u16(v, l);
-+ c = vdupq_n_u16(vlane);
-+ return vmlsq_u16(a,b,c);
-+}
-+
-+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x4_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdupq_n_u32(vlane);
-+ return vmlsq_u32(a,b,c);
-+}
-+
-+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32_t vlane;
-+ float32x4_t c;
-+ vlane = (float) vget_lane_f32(v, l);
-+ c = vdupq_n_f32(vlane);
-+ return vmlsq_f32(a,b,c);
-+}
-+
-+// **** Vector widening multiply subtract by scalar ****
-+// ****************************************************
-+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlsl_s16(a, b, c);
-+}
-+
-+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vmlsl_s32(a, b, c);
-+}
-+
-+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
-+{
-+ uint16_t vlane;
-+ uint16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vmlsl_s16(a, b, c);
-+}
-+
-+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t c;
-+ vlane = vget_lane_u32(v, l);
-+ c = vdup_n_u32(vlane);
-+ return vmlsl_u32(a, b, c);
-+}
-+
-+//********* Vector widening saturating doubling multiply subtract by scalar **************************
-+//******************************************************************************************************
-+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
-+{
-+ int16_t vlane;
-+ int16x4_t c;
-+ vlane = vget_lane_s16(v, l);
-+ c = vdup_n_s16(vlane);
-+ return vqdmlsl_s16(a, b, c);
-+}
-+
-+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32_t vlane;
-+ int32x2_t c;
-+ vlane = vget_lane_s32(v, l);
-+ c = vdup_n_s32(vlane);
-+ return vqdmlsl_s32(a, b, c);
-+}
-+//********** Vector multiply with scalar *****************************
-+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
-+{
-+ int16x4_t b16x4;
-+ b16x4 = vdup_n_s16(b);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
-+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
-+{
-+ //serial solution looks faster
-+ int32x2_t b32x2;
-+ b32x2 = vdup_n_s32(b);
-+ return vmul_s32(a, b32x2);
-+}
-+
-+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
-+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
-+{
-+ float32x2_t b32x2;
-+ b32x2 = vdup_n_f32(b);
-+ return vmul_f32(a, b32x2);
-+}
-+
-+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
-+{
-+ uint16x4_t b16x4;
-+ b16x4 = vdup_n_s16(b);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
-+{
-+ //serial solution looks faster
-+ uint32x2_t b32x2;
-+ b32x2 = vdup_n_u32(b);
-+ return vmul_u32(a, b32x2);
-+}
-+
-+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
-+{
-+ int16x8_t b16x8;
-+ b16x8 = vdupq_n_s16(b);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
-+{
-+ int32x4_t b32x4;
-+ b32x4 = vdupq_n_s32(b);
-+ return vmulq_s32(a, b32x4);
-+}
-+
-+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
-+{
-+ float32x4_t b32x4;
-+ b32x4 = vdupq_n_f32(b);
-+ return vmulq_f32(a, b32x4);
-+}
-+
-+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
-+{
-+ uint16x8_t b16x8;
-+ b16x8 = vdupq_n_s16(b);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
-+{
-+ uint32x4_t b32x4;
-+ b32x4 = vdupq_n_u32(b);
-+ return vmulq_u32(a, b32x4);
-+}
-+
-+//********** Vector multiply lane *****************************
-+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
-+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
-+{
-+ int16x4_t b16x4;
-+ int16_t vlane;
-+ vlane = vget_lane_s16(b, c);
-+ b16x4 = vdup_n_s16(vlane);
-+ return vmul_s16(a, b16x4);
-+}
-+
-+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
-+{
-+ int32x2_t b32x2;
-+ int32_t vlane;
-+ vlane = vget_lane_s32(b, c);
-+ b32x2 = vdup_n_s32(vlane);
-+ return vmul_s32(a, b32x2);
-+}
-+
-+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
-+{
-+ float32x2_t b32x2;
-+ float32_t vlane;
-+ vlane = vget_lane_f32(b, c);
-+ b32x2 = vdup_n_f32(vlane);
-+ return vmul_f32(a, b32x2);
-+}
-+
-+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
-+#define vmul_lane_u16 vmul_lane_s16
-+
-+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
-+#define vmul_lane_u32 vmul_lane_s32
-+
-+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
-+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
-+{
-+ int16x8_t b16x8;
-+ int16_t vlane;
-+ vlane = vget_lane_s16(b, c);
-+ b16x8 = vdupq_n_s16(vlane);
-+ return vmulq_s16(a, b16x8);
-+}
-+
-+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
-+{
-+ int32x4_t b32x4;
-+ int32_t vlane;
-+ vlane = vget_lane_s32(b, c);
-+ b32x4 = vdupq_n_s32(vlane);
-+ return vmulq_s32(a, b32x4);
-+}
-+
-+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
-+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
-+{
-+ float32x4_t b32x4;
-+ float32_t vlane;
-+ vlane = vget_lane_f32(b, c);
-+ b32x4 = vdupq_n_f32(vlane);
-+ return vmulq_f32(a, b32x4);
-+}
-+
-+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
-+#define vmulq_lane_u16 vmulq_lane_s16
-+
-+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
-+#define vmulq_lane_u32 vmulq_lane_s32
-+
-+//**** Vector long multiply with scalar ************
-+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
-+{
-+ int16x4_t b16x4;
-+ b16x4 = vdup_n_s16(val2);
-+ return vmull_s16(vec1, b16x4);
-+}
-+
-+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
-+{
-+ int32x2_t b32x2;
-+ b32x2 = vdup_n_s32(val2);
-+ return vmull_s32(vec1, b32x2);
-+}
-+
-+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.U16 q0,d0,d0[0]
-+{
-+    uint16x4_t b16x4;
-+    b16x4 = vdup_n_u16(val2);
-+    return vmull_u16(vec1, b16x4); //unsigned widening multiply
-+}
-+
-+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
-+{
-+ uint32x2_t b32x2;
-+ b32x2 = vdup_n_u32(val2);
-+ return vmull_u32(vec1, b32x2);
-+}
-+
-+//**** Vector long multiply by scalar ****
-+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
-+{
-+ int16_t vlane;
-+ int16x4_t b;
-+ vlane = vget_lane_s16(val2, val3);
-+ b = vdup_n_s16(vlane);
-+ return vmull_s16(vec1, b);
-+}
-+
-+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
-+{
-+ int32_t vlane;
-+ int32x2_t b;
-+ vlane = vget_lane_s32(val2, val3);
-+ b = vdup_n_s32(vlane);
-+ return vmull_s32(vec1, b);
-+}
-+
-+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.U16 q0,d0,d0[0]
-+{
-+    uint16_t vlane;
-+    uint16x4_t b;
-+    vlane = vget_lane_u16(val2, val3);
-+    b = vdup_n_u16(vlane);
-+    return vmull_u16(vec1, b); //unsigned widening multiply
-+}
-+
-+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
-+{
-+ uint32_t vlane;
-+ uint32x2_t b;
-+ vlane = vget_lane_u32(val2, val3);
-+ b = vdup_n_u32(vlane);
-+ return vmull_u32(vec1, b);
-+}
-+
-+//********* Vector saturating doubling long multiply with scalar *******************
-+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
-+{
-+    //the serial solution may be faster due to saturation
-+ int16x4_t b;
-+ b = vdup_n_s16(val2);
-+ return vqdmull_s16(vec1, b);
-+}
-+
-+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t b;
-+ b = vdup_n_s32(val2);
-+ return vqdmull_s32(vec1,b); //slow serial function!!!!
-+}
-+
-+//************* Vector saturating doubling long multiply by scalar ***********************************************
-+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
-+{
-+ int16_t c;
-+ int16x4_t scalar;
-+ c = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(c);
-+ return vqdmull_s16(vec1, scalar);
-+}
-+
-+
-+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32_t c;
-+ int32x2_t scalar;
-+ c = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(c);
-+ return vqdmull_s32(vec1,scalar); //slow serial function!!!!
-+}
-+
-+// *****Vector saturating doubling multiply high with scalar *****
-+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
-+{
-+ int16x4_t res64;
-+ return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
-+}
-+
-+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
-+{
-+ int32x2_t res64;
-+ return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
-+}
-+
-+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(val2);
-+ return vqdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(val2);
-+ return vqdmulhq_s32(vec1, scalar);
-+}
-+
-+//***** Vector saturating doubling multiply high by scalar ****************
-+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16_t vlane;
-+ int16x4_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(vlane);
-+ return vqdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32_t vlane;
-+ int32x2_t scalar;
-+ vlane = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(vlane);
-+ return vqdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16_t vlane;
-+ int16x8_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdupq_n_s16(vlane );
-+ return vqdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+    //solution may not be optimal
-+ int32_t vlane;
-+ int32x4_t scalar;
-+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
-+ scalar = vdupq_n_s32(vlane );
-+ return vqdmulhq_s32(vec1, scalar);
-+}
-+
-+//******** Vector saturating rounding doubling multiply high with scalar ***
-+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16x4_t scalar;
-+ scalar = vdup_n_s16(val2);
-+ return vqrdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x2_t scalar;
-+ scalar = vdup_n_s32(val2);
-+ return vqrdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(val2);
-+ return vqrdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(val2);
-+ return vqrdmulhq_s32(vec1, scalar);
-+}
-+
-+//********* Vector rounding saturating doubling multiply high by scalar ****
-+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
-+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16_t vlane;
-+ int16x4_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdup_n_s16(vlane);
-+ return vqrdmulh_s16(vec1, scalar);
-+}
-+
-+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+ int32_t vlane;
-+ int32x2_t scalar;
-+ vlane = vget_lane_s32(val2, val3);
-+ scalar = vdup_n_s32(vlane);
-+ return vqrdmulh_s32(vec1, scalar);
-+}
-+
-+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
-+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
-+{
-+    //solution may not be optimal
-+ int16_t vlane;
-+ int16x8_t scalar;
-+ vlane = vget_lane_s16(val2, val3);
-+ scalar = vdupq_n_s16(vlane);
-+ return vqrdmulhq_s16(vec1, scalar);
-+}
-+
-+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-+{
-+    //solution may not be optimal
-+ int32_t vlane;
-+ int32x4_t scalar;
-+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
-+ scalar = vdupq_n_s32(vlane );
-+ return vqrdmulhq_s32(vec1, scalar);
-+}
-+
-+//**************Vector multiply accumulate with scalar *******************
-+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
-+{
-+ int16x4_t scalar;
-+ scalar = vdup_n_s16(c);
-+ return vmla_s16(a, b, scalar);
-+}
-+
-+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
-+{
-+ int32x2_t scalar;
-+ scalar = vdup_n_s32(c);
-+ return vmla_s32(a, b, scalar);
-+}
-+
-+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
-+#define vmla_n_u16 vmla_n_s16
-+
-+
-+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
-+#define vmla_n_u32 vmla_n_s32
-+
-+
-+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
-+{
-+ float32x2_t scalar;
-+ scalar = vdup_n_f32(c);
-+ return vmla_f32(a, b, scalar);
-+}
-+
-+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
-+{
-+ int16x8_t scalar;
-+ scalar = vdupq_n_s16(c);
-+ return vmlaq_s16(a,b,scalar);
-+}
-+
-+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
-+{
-+ int32x4_t scalar;
-+ scalar = vdupq_n_s32(c);
-+ return vmlaq_s32(a,b,scalar);
-+}
-+
-+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-+#define vmlaq_n_u16 vmlaq_n_s16
-+
-+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-+#define vmlaq_n_u32 vmlaq_n_s32
-+
-+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
-+{
-+ float32x4_t scalar;
-+ scalar = vdupq_n_f32(c);
-+ return vmlaq_f32(a,b,scalar);
-+}
-+
-+//************Vector widening multiply accumulate with scalar****************************
-+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmlal_s16(a, b, vc);
-+}
-+
-+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmlal_s32(a, b, vc);
-+}
-+
-+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
-+{
-+    uint16x4_t vc;
-+    vc = vdup_n_u16(c);
-+    return vmlal_u16(a, b, vc); //unsigned widening multiply-accumulate
-+}
-+
-+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmlal_u32(a, b, vc);
-+}
-+
-+//************ Vector widening saturating doubling multiply accumulate with scalar **************
-+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
-+{
-+    //not an optimal SIMD solution; a serial one may be faster
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vqdmlal_s16(a, b, vc);
-+}
-+
-+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vqdmlal_s32(a, b, vc);
-+}
-+
-+//******** Vector multiply subtract with scalar **************
-+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmls_s16(a, b, vc);
-+}
-+
-+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmls_s32(a, b, vc);
-+}
-+
-+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
-+{
-+ uint16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmls_s16(a, b, vc);
-+}
-+
-+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmls_u32(a, b, vc);
-+}
-+
-+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
-+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
-+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
-+ return res;
-+}
-+
-+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
-+{
-+ int16x8_t vc;
-+ vc = vdupq_n_s16(c);
-+ return vmlsq_s16(a, b,vc);
-+}
-+
-+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
-+{
-+ int32x4_t vc;
-+ vc = vdupq_n_s32(c);
-+ return vmlsq_s32(a,b,vc);
-+}
-+
-+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
-+{
-+    uint16x8_t vc;
-+    vc = vdupq_n_u16(c); //duplicate into 16-bit lanes, not 32-bit ones
-+    return vmlsq_u16(a,b,vc);
-+}
-+
-+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
-+{
-+ uint32x4_t vc;
-+ vc = vdupq_n_u32(c);
-+ return vmlsq_u32(a,b,vc);
-+}
-+
-+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
-+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
-+{
-+ float32x4_t vc;
-+ vc = vdupq_n_f32(c);
-+ return vmlsq_f32(a,b,vc);
-+}
-+
-+//**** Vector widening multiply subtract with scalar ******
-+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vmlsl_s16(a, b, vc);
-+}
-+
-+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vmlsl_s32(a, b, vc);
-+}
-+
-+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
-+{
-+ uint16x4_t vc;
-+ vc = vdup_n_u16(c);
-+ return vmlsl_u16(a, b, vc);
-+}
-+
-+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
-+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
-+{
-+ uint32x2_t vc;
-+ vc = vdup_n_u32(c);
-+ return vmlsl_u32(a, b, vc);
-+}
-+
-+//***** Vector widening saturating doubling multiply subtract with scalar *********
-+//**********************************************************************************
-+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
-+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
-+{
-+ int16x4_t vc;
-+ vc = vdup_n_s16(c);
-+ return vqdmlsl_s16(a, b, vc);
-+}
-+
-+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t vc;
-+ vc = vdup_n_s32(c);
-+ return vqdmlsl_s32(a, b, vc);
-+}
-+
-+//******************* Vector extract ***********************************************
-+//*************************************************************************************
-+//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
-+//vector and the top end of the first, concatenates them, and places the result in the destination vector
-+//c elements from the bottom end of the second operand and (8-c) from the top end of the first
-+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int8x8_t res;
-+ int i;
-+ for (i = 0; i<8 - c; i++) {
-+ res.m64_i8[i] = a.m64_i8[i + c];
-+ }
-+ for(i = 0; i<c; i++) {
-+ res.m64_i8[8 - c + i] = b.m64_i8[i];
-+ }
-+ return res;
-+}
-+
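-+//A minimal usage sketch (the helper name is hypothetical, not part of the original header):
-+//with a = {0,1,2,3,4,5,6,7} and b = {8,9,10,11,12,13,14,15}, vext_s8(a, b, 3) yields
-+//{3,4,5,6,7,8,9,10} - the top five lanes of a followed by the low three lanes of b.
-+_NEON2SSE_INLINE int8x8_t vext_s8_example(int8x8_t a, int8x8_t b)
-+{
-+    return vext_s8(a, b, 3); //keeps a[3..7], then appends b[0..2]
-+}
-+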
-+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+#define vext_u8 vext_s8
-+//same result tested
-+
-+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-+#define vext_p8 vext_u8
-+
-+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int16x4_t res;
-+ int i;
-+ for (i = 0; i<4 - c; i++) {
-+ res.m64_i16[i] = a.m64_i16[i + c];
-+ }
-+ for(i = 0; i<c; i++) {
-+ res.m64_i16[4 - c + i] = b.m64_i16[i];
-+ }
-+ return res;
-+}
-+
-+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+#define vext_u16 vext_s16
-+
-+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-+#define vext_p16 vext_s16
-+
-+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ int32x2_t res;
-+ if (c==0) {
-+ res.m64_i32[0] = a.m64_i32[0];
-+ res.m64_i32[1] = a.m64_i32[1];
-+ } else {
-+ res.m64_i32[0] = a.m64_i32[1];
-+ res.m64_i32[1] = b.m64_i32[0];
-+ }
-+ return res;
-+}
-+
-+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
-+{
-+ float32x2_t res;
-+ if (c==0) {
-+ res.m64_f32[0] = a.m64_f32[0];
-+ res.m64_f32[1] = a.m64_f32[1];
-+ } else {
-+ res.m64_f32[0] = a.m64_f32[1];
-+ res.m64_f32[1] = b.m64_f32[0];
-+ }
-+ return res;
-+}
-+
-+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-+#define vext_u32 vext_s32
-+
-+
-+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+#define vext_s64(a,b,c) a
-+
-+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-+#define vext_u64(a,b,c) a
-+
-+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-+
-+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-+
-+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-+#define vextq_p8 vextq_s8
-+
-+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-+
-+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-+
-+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-+#define vextq_p16 vextq_s16
-+
-+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-+
-+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-+
-+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-+#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
-+
-+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-+
-+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-+#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-+
-+//************ Reverse vector elements (swap endianness)*****************
-+//*************************************************************************
-+//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev64q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
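-+//Usage sketch (hypothetical helper, not from the original header): reversing the byte order of a
-+//packed 64-bit value, e.g. {0,1,2,3,4,5,6,7} -> {7,6,5,4,3,2,1,0}, is a single vrev64 on the byte
-+//lanes, which the emulation above maps to one PSHUFB.
-+_NEON2SSE_INLINE int8x8_t vrev64_bswap_example(int8x8_t bytes)
-+{
-+    return vrev64_s8(bytes); //swap the byte order of the whole d-register
-+}
-+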
-+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vrev64q_s16(_pM128i(vec));
-+ return64(res);
-+}
-+
-+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
-+{
-+ int32x2_t res;
-+ res.m64_i32[0] = vec.m64_i32[1];
-+ res.m64_i32[1] = vec.m64_i32[0];
-+ return res;
-+}
-+
-+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
-+#define vrev64_u8 vrev64_s8
-+
-+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
-+#define vrev64_u16 vrev64_s16
-+
-+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
-+#define vrev64_u32 vrev64_s32
-+
-+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
-+#define vrev64_p8 vrev64_u8
-+
-+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
-+#define vrev64_p16 vrev64_u16
-+
-+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
-+{
-+ float32x2_t res;
-+ res.m64_f32[0] = vec.m64_f32[1];
-+ res.m64_f32[1] = vec.m64_f32[0];
-+ return res;
-+}
-+
-+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
-+{
-+ //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
-+}
-+
-+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
-+{
-+ return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
-+}
-+
-+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-+#define vrev64q_u8 vrev64q_s8
-+
-+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-+#define vrev64q_u16 vrev64q_s16
-+
-+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-+#define vrev64q_u32 vrev64q_s32
-+
-+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-+#define vrev64q_p8 vrev64q_u8
-+
-+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-+#define vrev64q_p16 vrev64q_u16
-+
-+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-+#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
-+
-+//******************** 32 bit shuffles **********************
-+//************************************************************
-+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev32q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
-+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vrev32q_s16(_pM128i(vec));
-+ return64(res);
-+}
-+
-+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
-+#define vrev32_u8 vrev32_s8
-+
-+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
-+#define vrev32_u16 vrev32_s16
-+
-+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
-+#define vrev32_p8 vrev32_u8
-+
-+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
-+#define vrev32_p16 vrev32_u16
-+
-+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
-+}
-+
-+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-+#define vrev32q_u8 vrev32q_s8
-+
-+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-+#define vrev32q_u16 vrev32q_s16
-+
-+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-+#define vrev32q_p8 vrev32q_u8
-+
-+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
-+#define vrev32q_p16 vrev32q_u16
-+
-+//************* 16 bit shuffles **********************
-+//******************************************************
-+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vrev16q_s8(_pM128i(vec));
-+ return64(res);
-+}
-+
-+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
-+#define vrev16_u8 vrev16_s8
-+
-+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
-+#define vrev16_p8 vrev16_u8
-+
-+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
-+ return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
-+}
-+
-+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-+#define vrev16q_u8 vrev16q_s8
-+
-+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
-+#define vrev16q_p8 vrev16q_u8
-+
-+//*********************************************************************
-+//**************** Other single operand arithmetic *******************
-+//*********************************************************************
-+
-+//*********** Absolute: Vd[i] = |Va[i]| **********************************
-+//************************************************************************
-+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi8(_pM128i(a));
-+ return64(res);
-+}
-+
-+
-+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = _mm_abs_epi32(_pM128i(a));
-+ return64(res);
-+}
-+
-+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-+#define vabsq_s8 _mm_abs_epi8
-+
-+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-+#define vabsq_s16 _mm_abs_epi16
-+
-+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-+#define vabsq_s32 _mm_abs_epi32
-+
-+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-+ return _mm_and_ps (a, *(__m128*)c7fffffff);
-+}
-+
-+//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
-+//**********************************************************************
-+//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
-+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vqabsq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
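-+//Illustrative sketch (the helper name and the vdup_n_s8 call are assumptions, not from the original
-+//patch): plain absolute value leaves INT8_MIN at -128 because |-128| does not fit in int8_t,
-+//while the saturating form clamps it to 127.
-+_NEON2SSE_INLINE int8x8_t vqabs_s8_example(void)
-+{
-+    int8x8_t v = vdup_n_s8(-128);
-+    return vqabs_s8(v); //every lane becomes 127 instead of wrapping back to -128
-+}
-+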
-+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vqabsq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vqabsq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
-+{
-+ __m128i c_128, abs, abs_cmp;
-+ c_128 = _mm_set1_epi8 (0x80); //-128
-+ abs = _mm_abs_epi8 (a);
-+ abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
-+{
-+ __m128i c_32768, abs, abs_cmp;
-+ c_32768 = _mm_set1_epi16 (0x8000); //-32768
-+ abs = _mm_abs_epi16 (a);
-+ abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
-+{
-+ __m128i c80000000, abs, abs_cmp;
-+ c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
-+ abs = _mm_abs_epi32 (a);
-+ abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
-+ return _mm_xor_si128 (abs, abs_cmp);
-+}
-+
-+//*************** Negate: Vd[i] = - Va[i] *************************************
-+//*****************************************************************************
-+//several Negate implementations are possible for SIMD,
-+//e.g. the _mm_sign function (a, negative numbers vector), but the following one gives good performance:
-+int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vnegq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vnegq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vnegq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
-+_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
-+{
-+ float32x4_t res;
-+ __m64_128 res64;
-+ _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
-+ _M64f(res64, res);
-+ return res64;
-+}
-+
-+int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi8 (zero, a);
-+} //or _mm_sign_epi8 (a, negative numbers vector)
-+
-+int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi16 (zero, a);
-+} //or _mm_sign_epi16 (a, negative numbers vector)
-+
-+int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
-+{
-+ __m128i zero;
-+ zero = _mm_setzero_si128 ();
-+ return _mm_sub_epi32 (zero, a);
-+} //or _mm_sign_epi32 (a, negative numbers vector)
-+
-+float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
-+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
-+{
-+ _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-+ return _mm_xor_ps (a, *(__m128*) c80000000);
-+}
-+
-+//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
-+//***************************************************************************************
-+//For signed-integer data types, the negation of the most negative value cannot be represented; with saturation the result is the maximum positive value
-+int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vqnegq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
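-+//Usage sketch (hypothetical helper; vdup_n_s8 is assumed to be defined earlier in this header):
-+//the only input that actually needs the saturating form is INT8_MIN, since -(-128) overflows
-+//int8_t; vqneg_s8 returns 127 for such lanes instead of -128.
-+_NEON2SSE_INLINE int8x8_t vqneg_s8_example(void)
-+{
-+    int8x8_t v = vdup_n_s8(-128);
-+    return vqneg_s8(v); //every lane saturates to 127
-+}
-+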
-+int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vqnegq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vqnegq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
-+{
-+    __m128i zero;
-+    zero = _mm_setzero_si128 ();
-+    return _mm_subs_epi8 (zero, a); //saturating subtraction
-+}
-+
-+int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
-+{
-+    __m128i zero;
-+    zero = _mm_setzero_si128 ();
-+    return _mm_subs_epi16 (zero, a); //saturating subtraction
-+}
-+
-+int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
-+{
-+    //solution may not be optimal compared with a serial one
-+    __m128i c80000000, zero, sub, cmp;
-+    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
-+    zero = _mm_setzero_si128 ();
-+    sub = _mm_sub_epi32 (zero, a); //subtraction
-+ cmp = _mm_cmpeq_epi32 (a, c80000000);
-+ return _mm_xor_si128 (sub, cmp);
-+}
-+
-+//****************** Count leading zeros ********************************
-+//**************************************************************************
-+//no corresponding vector intrinsics in IA32, so it has to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
-+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vclzq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
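-+//Illustrative helper (not part of the original header): per-byte count of leading zero bits,
-+//e.g. bytes 0x01, 0x10, 0x80 and 0x00 give 7, 3, 0 and 8 respectively.
-+_NEON2SSE_INLINE int8x8_t vclz_s8_example(int8x8_t v)
-+{
-+    return vclz_s8(v); //uses the 4-bit table lookup from vclzq_s8 below
-+}
-+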
-+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vclzq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vclzq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+
-+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
-+#define vclz_u8 vclz_s8
-+
-+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
-+#define vclz_u16 vclz_s16
-+
-+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
-+#define vclz_u32 vclz_s32
-+
-+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
-+ /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
-+ /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
-+ /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
-+ __m128i maskLOW, c4, lowclz, mask, hiclz;
-+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
-+ c4 = _mm_set1_epi8(4);
-+ lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
-+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
-+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
-+ hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
-+ mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
-+ lowclz = _mm_and_si128(lowclz,mask);
-+ return _mm_add_epi8(lowclz, hiclz);
-+}
-+
-+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
-+{
-+ __m128i c7, res8x16, res8x16_swap;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-+ _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
-+ c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
-+ res8x16 = vclzq_s8(a);
-+    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
-+ res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
-+ res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
-+ c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
-+ res8x16 = _mm_and_si128(res8x16, c7); //lowclz
-+ return _mm_add_epi16(res8x16_swap, res8x16);
-+}
-+
-+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
-+{
-+ __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
-+ c55555555 = _mm_set1_epi32(0x55555555);
-+ c33333333 = _mm_set1_epi32(0x33333333);
-+ c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
-+ c3f = _mm_set1_epi32(0x3f);
-+ c32 = _mm_set1_epi32(32);
-+ tmp = _mm_srli_epi32(a, 1);
-+ res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
-+ tmp = _mm_srli_epi32(res, 2);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
-+ tmp = _mm_srli_epi32(res, 4);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
-+ tmp = _mm_srli_epi32(res, 8);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
-+ tmp = _mm_srli_epi32(res, 16);
-+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
-+
-+ tmp = _mm_srli_epi32(res, 1);
-+ tmp = _mm_and_si128(tmp, c55555555);
-+ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
-+
-+ tmp = _mm_srli_epi32(res, 2);
-+ tmp = _mm_and_si128(tmp, c33333333);
-+ tmp1 = _mm_and_si128(res, c33333333);
-+ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
-+
-+ tmp = _mm_srli_epi32(res, 4);
-+ tmp = _mm_add_epi32(tmp, res);
-+ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
-+
-+ tmp = _mm_srli_epi32(res, 8);
-+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
-+
-+ tmp = _mm_srli_epi32(res, 16);
-+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
-+
-+ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
-+
-+ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
-+}
-+
-+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-+#define vclzq_u8 vclzq_s8
-+
-+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-+#define vclzq_u16 vclzq_s16
-+
-+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
-+#define vclzq_u32 vclzq_s32
-+
-+//************** Count leading sign bits **************************
-+//********************************************************************
-+//VCLS (Vector Count Leading Sign bits) counts, in each element of a vector, the number of
-+// consecutive bits following the topmost bit that are the same as the topmost bit.
-+//No corresponding vector intrinsics exist in IA32, so it has to be implemented.
-+//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
-+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
-+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vclsq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
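-+//Usage sketch (hypothetical helper; vdup_n_s8 is assumed from earlier in this header): VCLS of
-+//0 or -1 is 7, of 1 or -2 is 6, and of 0x40 is 0, i.e. it measures how far a signed value can be
-+//shifted left before its sign would change.
-+_NEON2SSE_INLINE int8x8_t vcls_s8_example(void)
-+{
-+    int8x8_t v = vdup_n_s8(-2); //0xfe: the six bits below the sign bit repeat it
-+    return vcls_s8(v); //every lane yields 6
-+}
-+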
-+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
-+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vclsq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
-+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vclsq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
-+{
-+ __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
-+ cff = _mm_cmpeq_epi8 (a,a); //0xff
-+ c80 = _mm_set1_epi8(0x80);
-+ c1 = _mm_set1_epi8(1);
-+ a_mask = _mm_and_si128(a, c80);
-+ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s8(a_comb);
-+ return _mm_sub_epi8(a_comb, c1);
-+}
-+
-+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
-+{
-+ __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
-+ cffff = _mm_cmpeq_epi16(a,a);
-+ c8000 = _mm_slli_epi16(cffff, 15); //0x8000
-+ c1 = _mm_srli_epi16(cffff,15); //0x1
-+ a_mask = _mm_and_si128(a, c8000);
-+ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cffff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s16(a_comb);
-+ return _mm_sub_epi16(a_comb, c1);
-+}
-+
-+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
-+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
-+{
-+ __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
-+ cffffffff = _mm_cmpeq_epi32(a,a);
-+ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
-+ c1 = _mm_srli_epi32(cffffffff,31); //0x1
-+ a_mask = _mm_and_si128(a, c80000000);
-+ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
-+ a_neg = _mm_xor_si128(a, cffffffff);
-+ a_neg = _mm_and_si128(a_mask, a_neg);
-+ a_pos = _mm_andnot_si128(a_mask, a);
-+ a_comb = _mm_or_si128(a_pos, a_neg);
-+ a_comb = vclzq_s32(a_comb);
-+ return _mm_sub_epi32(a_comb, c1);
-+}
-+
-+//************************* Count number of set bits ********************************
-+//*************************************************************************************
-+//No corresponding SIMD solution. One option is to extract the elements of a, convert each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic on each element;
-+//another option is the following table-lookup algorithm:
-+
-+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
-+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
-+{
-+ uint8x8_t res64;
-+ __m128i res;
-+ res = vcntq_u8(_pM128i(a));
-+ return64(res);
-+}
-+
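-+//Illustrative helper (not from the original patch; assumes the d-register types share the union
-+//members used elsewhere in this file): the per-byte counts from vcnt_u8 above can be summed to get
-+//the population count of a whole d-register, e.g. as a Hamming-distance building block; the
-+//horizontal add is kept in scalar code here for clarity.
-+_NEON2SSE_INLINE int vcnt_u8_popcount64_example(uint8x8_t v)
-+{
-+    uint8x8_t cnt = vcnt_u8(v);
-+    int i, sum = 0;
-+    for (i = 0; i < 8; i++) sum += cnt.m64_i8[i]; //each count is 0..8, so the signed view is safe
-+    return sum;
-+}
-+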
-+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
-+#define vcnt_s8 vcnt_u8
-+
-+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
-+#define vcnt_p8 vcnt_u8
-+
-+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
-+{
-+ _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
-+ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
-+ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
-+ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 };
-+ __m128i maskLOW, mask, lowpopcnt, hipopcnt;
-+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
-+ mask = _mm_and_si128(a, maskLOW);
-+ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
-+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
-+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
-+ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
-+ return _mm_add_epi8(lowpopcnt, hipopcnt);
-+}
-+
-+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-+#define vcntq_s8 vcntq_u8
-+
-+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
-+#define vcntq_p8 vcntq_u8
-+
-+//**************************************************************************************
-+//*********************** Logical operations ****************************************
-+//**************************************************************************************
-+//************************** Bitwise not ***********************************
-+//several Bitwise not implementations are possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance
-+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vmvnq_s8(_pM128i(a));
-+ return64(res);
-+}
-+
-+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
-+{
-+ int16x4_t res64;
-+ __m128i res;
-+ res = vmvnq_s16(_pM128i(a));
-+ return64(res);
-+}
-+
-+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
-+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
-+{
-+ int32x2_t res64;
-+ __m128i res;
-+ res = vmvnq_s32(_pM128i(a));
-+ return64(res);
-+}
-+
-+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
-+#define vmvn_u8 vmvn_s8
-+
-+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
-+#define vmvn_u16 vmvn_s16
-+
-+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
-+#define vmvn_u32 vmvn_s32
-+
-+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
-+#define vmvn_p8 vmvn_u8
-+
-+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi8 (a,a); //0xff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi16 (a,a); //0xffff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
-+{
-+ __m128i c1;
-+ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-+ return _mm_andnot_si128 (a, c1);
-+}
-+
-+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-+#define vmvnq_u8 vmvnq_s8
-+
-+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-+#define vmvnq_u16 vmvnq_s16
-+
-+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-+#define vmvnq_u32 vmvnq_s32
-+
-+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
-+#define vmvnq_p8 vmvnq_u8
-+
-+//****************** Bitwise and ***********************
-+//******************************************************
-+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
-+#define vand_u8 vand_s8
-+
-+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
-+#define vand_u16 vand_s16
-+
-+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
-+#define vand_u32 vand_s32
-+
-+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
-+#define vand_u64 vand_s64
-+
-+
-+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-+#define vandq_s8 _mm_and_si128
-+
-+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-+#define vandq_s16 _mm_and_si128
-+
-+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-+#define vandq_s32 _mm_and_si128
-+
-+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-+#define vandq_s64 _mm_and_si128
-+
-+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-+#define vandq_u8 _mm_and_si128
-+
-+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-+#define vandq_u16 _mm_and_si128
-+
-+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-+#define vandq_u32 _mm_and_si128
-+
-+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
-+#define vandq_u64 _mm_and_si128
-+
-+//******************** Bitwise or *********************************
-+//******************************************************************
-+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+
-+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
-+#define vorr_u8 vorr_s8
-+
-+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
-+#define vorr_u16 vorr_s16
-+
-+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
-+#define vorr_u32 vorr_s32
-+
-+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
-+#define vorr_u64 vorr_s64
-+
-+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-+#define vorrq_s8 _mm_or_si128
-+
-+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-+#define vorrq_s16 _mm_or_si128
-+
-+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-+#define vorrq_s32 _mm_or_si128
-+
-+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-+#define vorrq_s64 _mm_or_si128
-+
-+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-+#define vorrq_u8 _mm_or_si128
-+
-+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-+#define vorrq_u16 _mm_or_si128
-+
-+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-+#define vorrq_u32 _mm_or_si128
-+
-+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
-+#define vorrq_u64 _mm_or_si128
-+
-+//************* Bitwise exclusive or (EOR or XOR) ******************
-+//*******************************************************************
-+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
-+}
-+
-+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
-+#define veor_s16 veor_s8
-+
-+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
-+#define veor_s32 veor_s8
-+
-+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
-+ return res;
-+}
-+
-+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
-+#define veor_u8 veor_s8
-+
-+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
-+#define veor_u16 veor_s16
-+
-+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
-+#define veor_u32 veor_s32
-+
-+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
-+#define veor_u64 veor_s64
-+
-+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-+#define veorq_s8 _mm_xor_si128
-+
-+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-+#define veorq_s16 _mm_xor_si128
-+
-+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-+#define veorq_s32 _mm_xor_si128
-+
-+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-+#define veorq_s64 _mm_xor_si128
-+
-+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-+#define veorq_u8 _mm_xor_si128
-+
-+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-+#define veorq_u16 _mm_xor_si128
-+
-+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-+#define veorq_u32 _mm_xor_si128
-+
-+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
-+#define veorq_u64 _mm_xor_si128
-+
-+//********************** Bit Clear **********************************
-+//*******************************************************************
-+//Logical AND complement (AND negation or AND NOT)
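-+//A note on the "arguments swap" comments below: _mm_andnot_si128(x, y) computes
-+//(~x) & y, i.e. it negates its FIRST operand, while VBIC computes a & ~b. Hence
-+//BIC(a, b) maps to _mm_andnot_si128(b, a). Quick check with 4-bit values
-+//a = 1100b, b = 1010b: a & ~b = 1100 & 0101 = 0100, and andnot(b, a) gives the same.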
-+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
-+}
-+
-+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
-+#define vbic_s16 vbic_s8
-+
-+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
-+#define vbic_s32 vbic_s8
-+
-+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
-+#define vbic_u8 vbic_s8
-+
-+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
-+#define vbic_u16 vbic_s16
-+
-+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
-+#define vbic_u32 vbic_s32
-+
-+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
-+#define vbic_u64 vbic_s64
-+
-+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-+
-+//**************** Bitwise OR complement ********************************
-+//***************************************************************************
-+//no exact IA-32 match; it is implemented as follows
-+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
-+{
-+ int8x8_t res64;
-+ return64(vornq_s8(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
-+{
-+ int16x4_t res64;
-+ return64(vornq_s16(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2_t res64;
-+ return64(vornq_s32(_pM128i(a), _pM128i(b)));
-+}
-+
-+
-+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
-+#define vorn_u8 vorn_s8
-+
-+
-+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
-+#define vorn_u16 vorn_s16
-+
-+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
-+#define vorn_u32 vorn_s32
-+
-+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
-+#define vorn_u64 vorn_s64
-+
-+
-+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s8( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s16( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s32( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
-+{
-+ __m128i c1, b1;
-+ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
-+ b1 = _mm_andnot_si128 (b, c1);
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_u8( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_s16( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+
-+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
-+{
-+ __m128i b1;
-+ b1 = vmvnq_u32( b); //bitwise not for b
-+ return _mm_or_si128 (a, b1);
-+}
-+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
-+#define vornq_u64 vornq_s64
-+
-+//********************* Bitwise Select *****************************
-+//******************************************************************
-+//Note: this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
-+
-+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
-+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
-+
-+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
-+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
-+
-+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
-+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
-+
-+//Only VBSL is implemented for SIMD
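-+//The selection formula is dst = (mask & b) | (~mask & c), with 'a' acting as the
-+//per-bit mask (see the serial vbsl_s64 implementation below). Worked example on
-+//4-bit values a = 1100b, b = 1010b, c = 0110b:
-+//a & b = 1000b, ~a & c = 0010b, result = 1010b,
-+//i.e. the two high bits come from b and the two low bits come from c.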
-+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
-+{
-+ int8x8_t res64;
-+ __m128i res;
-+ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
-+ return64(res);
-+}
-+
-+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_s16 vbsl_s8
-+
-+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
-+#define vbsl_s32 vbsl_s8
-+
-+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
-+{
-+ int64x1_t res;
-+ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
-+ return res;
-+}
-+
-+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
-+#define vbsl_u8 vbsl_s8
-+
-+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_u16 vbsl_s8
-+
-+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
-+#define vbsl_u32 vbsl_s8
-+
-+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
-+#define vbsl_u64 vbsl_s64
-+
-+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
-+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
-+{
-+ __m128 sel1, sel2;
-+ __m64_128 res64;
-+ sel1 = _mm_and_ps (_pM128(a), _pM128(b));
-+ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
-+ sel1 = _mm_or_ps (sel1, sel2);
-+ _M64f(res64, sel1);
-+ return res64;
-+}
-+
-+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
-+#define vbsl_p8 vbsl_s8
-+
-+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
-+#define vbsl_p16 vbsl_s8
-+
-+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
-+{
-+ __m128i sel1, sel2;
-+ sel1 = _mm_and_si128 (a, b);
-+ sel2 = _mm_andnot_si128 (a, c);
-+ return _mm_or_si128 (sel1, sel2);
-+}
-+
-+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_s16 vbslq_s8
-+
-+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-+#define vbslq_s32 vbslq_s8
-+
-+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-+#define vbslq_s64 vbslq_s8
-+
-+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-+#define vbslq_u8 vbslq_s8
-+
-+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_u16 vbslq_s8
-+
-+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-+#define vbslq_u32 vbslq_s8
-+
-+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-+#define vbslq_u64 vbslq_s8
-+
-+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
-+{
-+ __m128 sel1, sel2;
-+ sel1 = _mm_and_ps (*(__m128*)&a, b);
-+ sel2 = _mm_andnot_ps (*(__m128*)&a, c);
-+ return _mm_or_ps (sel1, sel2);
-+}
-+
-+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-+#define vbslq_p8 vbslq_u8
-+
-+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
-+#define vbslq_p16 vbslq_s8
-+
-+//************************************************************************************
-+//**************** Transposition operations ****************************************
-+//************************************************************************************
-+//***************** Vector Transpose ************************************************
-+//************************************************************************************
-+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
-+// making the result look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
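-+//Viewed as 2x2 matrices: with a = (a0, a1, a2, a3) and b = (b0, b1, b2, b3), the
-+//pairs {a0,a1 / b0,b1} and {a2,a3 / b2,b3} are each transposed, giving
-+//val[0] = (a0, b0, a2, b2) and val[1] = (a1, b1, a3, b3).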
-+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
-+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
-+ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
-+ return val;
-+}
-+
-+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
-+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
-+ return val;
-+}
-+
-+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
-+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
-+{
-+ int32x2x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
-+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
-+ return val;
-+}
-+
-+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
-+#define vtrn_u8 vtrn_s8
-+
-+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
-+#define vtrn_u16 vtrn_s16
-+
-+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
-+#define vtrn_u32 vtrn_s32
-+
-+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
-+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
-+{
-+ float32x2x2_t val;
-+ val.val[0].m64_f32[0] = a.m64_f32[0];
-+ val.val[0].m64_f32[1] = b.m64_f32[0];
-+ val.val[1].m64_f32[0] = a.m64_f32[1];
-+ val.val[1].m64_f32[1] = b.m64_f32[1];
-+ return val; //a0,b0,a1,b1
-+}
-+
-+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
-+#define vtrn_p8 vtrn_u8
-+
-+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
-+#define vtrn_p16 vtrn_s16
-+
-+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
-+{
-+ int8x16x2_t r8x16;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
-+
-+ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
-+ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
-+ return r8x16;
-+}
-+
-+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
-+{
-+ int16x8x2_t v16x8;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
-+ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
-+ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
-+ return v16x8;
-+}
-+
-+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
-+{
-+ //may not be an optimal solution compared with the serial version
-+ int32x4x2_t v32x4;
-+ __m128i a_sh, b_sh;
-+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
-+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
-+
-+ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
-+ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
-+ return v32x4;
-+}
-+
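-+//Note on the magic constant 216 used above and in vuzpq_s32 below:
-+//216 == 0xD8 == _MM_SHUFFLE(3, 1, 2, 0), i.e. it reorders a 4x32-bit vector as
-+//(a0, a2, a1, a3), gathering the even-indexed and odd-indexed elements together
-+//before the unpack step.
-+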
-+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-+#define vtrnq_u8 vtrnq_s8
-+
-+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-+#define vtrnq_u16 vtrnq_s16
-+
-+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-+#define vtrnq_u32 vtrnq_s32
-+
-+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
-+{
-+ //may not be an optimal solution compared with the serial version
-+ float32x4x2_t f32x4;
-+ __m128 a_sh, b_sh;
-+ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
-+ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
-+
-+ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
-+ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
-+ return f32x4;
-+}
-+
-+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-+#define vtrnq_p8 vtrnq_s8
-+
-+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
-+#define vtrnq_p16 vtrnq_s16
-+
-+//***************** Interleave elements ***************************
-+//*****************************************************************
-+//output has (a0,b0,a1,b1, a2,b2,.....)
-+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
-+ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
-+#define vzip_s32 vtrn_s32
-+
-+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
-+#define vzip_u8 vzip_s8
-+
-+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
-+#define vzip_u16 vzip_s16
-+
-+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
-+#define vzip_u32 vzip_s32
-+
-+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
-+#define vzip_f32 vtrn_f32
-+
-+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
-+#define vzip_p8 vzip_u8
-+
-+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
-+#define vzip_p16 vzip_u16
-+
-+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
-+{
-+ int8x16x2_t r8x16;
-+ r8x16.val[0] = _mm_unpacklo_epi8(a, b);
-+ r8x16.val[1] = _mm_unpackhi_epi8(a, b);
-+ return r8x16;
-+}
-+
-+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
-+{
-+ int16x8x2_t r16x8;
-+ r16x8.val[0] = _mm_unpacklo_epi16(a, b);
-+ r16x8.val[1] = _mm_unpackhi_epi16(a, b);
-+ return r16x8;
-+}
-+
-+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
-+{
-+ int32x4x2_t r32x4;
-+ r32x4.val[0] = _mm_unpacklo_epi32(a, b);
-+ r32x4.val[1] = _mm_unpackhi_epi32(a, b);
-+ return r32x4;
-+}
-+
-+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-+#define vzipq_u8 vzipq_s8
-+
-+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-+#define vzipq_u16 vzipq_s16
-+
-+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-+#define vzipq_u32 vzipq_s32
-+
-+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
-+{
-+ float32x4x2_t f32x4;
-+ f32x4.val[0] = _mm_unpacklo_ps ( a, b);
-+ f32x4.val[1] = _mm_unpackhi_ps ( a, b);
-+ return f32x4;
-+}
-+
-+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-+#define vzipq_p8 vzipq_u8
-+
-+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
-+#define vzipq_p16 vzipq_u16
-+
-+//*********************** De-Interleave elements *************************
-+//*************************************************************************
-+//As a result of these functions, the first val contains (a0, a2, a4, ..., b0, b2, b4, ...) and the second val (a1, a3, a5, ..., b1, b3, b5, ...)
-+//no such functions exist in IA-32 SIMD; a shuffle is required
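-+//vuzp is the inverse of vzip: feeding the two vzip results back into vuzp
-+//recovers the original vectors. E.g. for 4-element vectors a = (a0,a1,a2,a3),
-+//b = (b0,b1,b2,b3): vzip gives (a0,b0,a1,b1) and (a2,b2,a3,b3), and vuzp of
-+//those two gives back (a0,a1,a2,a3) and (b0,b1,b2,b3).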
-+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
-+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
-+{
-+ int8x8x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
-+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
-+ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
-+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
-+{
-+ int16x4x2_t val;
-+ __m128i tmp, val0;
-+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
-+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
-+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
-+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
-+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
-+{
-+ int32x2x2_t val;
-+ __m128i val0;
-+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
-+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
-+ return val;
-+}
-+
-+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
-+#define vuzp_u8 vuzp_s8
-+
-+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
-+#define vuzp_u16 vuzp_s16
-+
-+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
-+#define vuzp_u32 vuzp_s32
-+
-+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
-+#define vuzp_f32 vzip_f32
-+
-+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
-+#define vuzp_p8 vuzp_u8
-+
-+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
-+#define vuzp_p16 vuzp_u16
-+
-+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
-+{
-+ int8x16x2_t v8x16;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
-+ //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
-+ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
-+ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
-+ return v8x16;
-+}
-+
-+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
-+{
-+ int16x8x2_t v16x8;
-+ __m128i a_sh, b_sh;
-+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
-+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
-+ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
-+ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
-+ return v16x8;
-+}
-+
-+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
-+{
-+ //may not be an optimal solution compared with the serial version
-+ int32x4x2_t v32x4;
-+ __m128i a_sh, b_sh;
-+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
-+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
-+
-+ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
-+ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
-+ return v32x4;
-+}
-+
-+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-+#define vuzpq_u8 vuzpq_s8
-+
-+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-+#define vuzpq_u16 vuzpq_s16
-+
-+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-+#define vuzpq_u32 vuzpq_s32
-+
-+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
-+{
-+ float32x4x2_t v32x4;
-+ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
-+ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
-+ return v32x4;
-+}
-+
-+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-+#define vuzpq_p8 vuzpq_u8
-+
-+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
-+#define vuzpq_p16 vuzpq_u16
-+
-+//##############################################################################################
-+//*********************** Reinterpret cast intrinsics.******************************************
-+//##############################################################################################
-+// Not a part of the official NEON instruction set but available in the gcc compiler *******************
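-+// Most of the integer<->integer reinterprets below are defined as empty macros:
-+// "#define vreinterpret_p8_u32" has an empty replacement, so a call such as
-+// vreinterpret_p8_u32(t) simply expands to (t). This relies on the integer vector
-+// types being typedefs of the same underlying container in this header, so no
-+// actual conversion is needed; only the float <-> integer cases need an explicit
-+// cast (e.g. _M128i / _mm_castps_si128).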
-+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
-+#define vreinterpret_p8_u32
-+
-+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
-+#define vreinterpret_p8_u16
-+
-+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
-+#define vreinterpret_p8_u8
-+
-+poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
-+#define vreinterpret_p8_s32
-+
-+poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
-+#define vreinterpret_p8_s16
-+
-+poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
-+#define vreinterpret_p8_s8
-+
-+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
-+#define vreinterpret_p8_u64
-+
-+poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
-+#define vreinterpret_p8_s64
-+
-+poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
-+#define vreinterpret_p8_f32
-+
-+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
-+#define vreinterpret_p8_p16
-+
-+poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
-+#define vreinterpretq_p8_u32
-+
-+poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
-+#define vreinterpretq_p8_u16
-+
-+poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
-+#define vreinterpretq_p8_u8
-+
-+poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
-+#define vreinterpretq_p8_s32
-+
-+poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
-+#define vreinterpretq_p8_s16
-+
-+poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
-+#define vreinterpretq_p8_s8
-+
-+poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
-+#define vreinterpretq_p8_u64
-+
-+poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
-+#define vreinterpretq_p8_s64
-+
-+poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
-+#define vreinterpretq_p8_f32(t) _M128i(t)
-+
-+poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
-+#define vreinterpretq_p8_p16
-+
-+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
-+#define vreinterpret_p16_u32
-+
-+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
-+#define vreinterpret_p16_u16
-+
-+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
-+#define vreinterpret_p16_u8
-+
-+poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
-+#define vreinterpret_p16_s32
-+
-+poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
-+#define vreinterpret_p16_s16
-+
-+poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
-+#define vreinterpret_p16_s8
-+
-+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
-+#define vreinterpret_p16_u64
-+
-+poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
-+#define vreinterpret_p16_s64
-+
-+poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
-+#define vreinterpret_p16_f32
-+
-+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
-+#define vreinterpret_p16_p8
-+
-+poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
-+#define vreinterpretq_p16_u32
-+
-+poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
-+#define vreinterpretq_p16_u16
-+
-+poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
-+#define vreinterpretq_p16_s32
-+
-+poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
-+#define vreinterpretq_p16_s16
-+
-+poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
-+#define vreinterpretq_p16_s8
-+
-+poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
-+#define vreinterpretq_p16_u64
-+
-+poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
-+#define vreinterpretq_p16_s64
-+
-+poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
-+#define vreinterpretq_p16_f32(t) _M128i(t)
-+
-+poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
-+#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
-+
-+//**** Integer to float ******
-+float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
-+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
-+
-+
-+float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
-+#define vreinterpret_f32_u16 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
-+#define vreinterpret_f32_u8 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s32 (int32x2_t t);
-+#define vreinterpret_f32_s32 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s16 (int16x4_t t);
-+#define vreinterpret_f32_s16 vreinterpret_f32_u32
-+
-+float32x2_t vreinterpret_f32_s8 (int8x8_t t);
-+#define vreinterpret_f32_s8 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_u64(uint64x1_t t);
-+#define vreinterpret_f32_u64 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_s64 (int64x1_t t);
-+#define vreinterpret_f32_s64 vreinterpret_f32_u32
-+
-+
-+float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
-+#define vreinterpret_f32_p16 vreinterpret_f32_u32
-+
-+float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
-+#define vreinterpret_f32_p8 vreinterpret_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
-+#define vreinterpretq_f32_u32(t) *(__m128*)&(t)
-+
-+float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
-+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
-+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
-+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
-+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
-+#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
-+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
-+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
-+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
-+
-+float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
-+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
-+
-+//*** Integer type conversions ******************
-+//no conversion necessary for the following functions because it is same data type
-+int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
-+#define vreinterpret_s64_u32
-+
-+int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
-+#define vreinterpret_s64_u16
-+
-+int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
-+#define vreinterpret_s64_u8
-+
-+int64x1_t vreinterpret_s64_s32 (int32x2_t t);
-+#define vreinterpret_s64_s32
-+
-+int64x1_t vreinterpret_s64_s16 (int16x4_t t);
-+#define vreinterpret_s64_s16
-+
-+int64x1_t vreinterpret_s64_s8 (int8x8_t t);
-+#define vreinterpret_s64_s8
-+
-+int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
-+#define vreinterpret_s64_u64
-+
-+int64x1_t vreinterpret_s64_f32 (float32x2_t t);
-+#define vreinterpret_s64_f32
-+
-+int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
-+#define vreinterpret_s64_p16
-+
-+int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
-+#define vreinterpret_s64_p8
-+
-+int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
-+#define vreinterpretq_s64_u32
-+
-+int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
-+#define vreinterpretq_s64_s16
-+
-+int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
-+#define vreinterpretq_s64_u8
-+
-+int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
-+#define vreinterpretq_s64_s32
-+
-+int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
-+#define vreinterpretq_s64_u16
-+
-+int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
-+#define vreinterpretq_s64_s8
-+
-+int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
-+#define vreinterpretq_s64_u64
-+
-+int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
-+#define vreinterpretq_s64_f32(t) _M128i(t)
-+
-+int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
-+#define vreinterpretq_s64_p16
-+
-+int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
-+#define vreinterpretq_s64_p8
-+
-+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
-+#define vreinterpret_u64_u32
-+
-+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
-+#define vreinterpret_u64_u16
-+
-+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
-+#define vreinterpret_u64_u8
-+
-+uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
-+#define vreinterpret_u64_s32
-+
-+uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
-+#define vreinterpret_u64_s16
-+
-+uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
-+#define vreinterpret_u64_s8
-+
-+uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
-+#define vreinterpret_u64_s64
-+
-+uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
-+#define vreinterpret_u64_f32
-+
-+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
-+#define vreinterpret_u64_p16
-+
-+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
-+#define vreinterpret_u64_p8
-+
-+uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
-+#define vreinterpretq_u64_u32
-+
-+uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
-+#define vreinterpretq_u64_u16
-+
-+uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
-+#define vreinterpretq_u64_u8
-+
-+uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
-+#define vreinterpretq_u64_s32
-+
-+uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
-+#define vreinterpretq_u64_s16
-+
-+uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
-+#define vreinterpretq_u64_s8
-+
-+uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
-+#define vreinterpretq_u64_s64
-+
-+uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
-+#define vreinterpretq_u64_f32(t) _M128i(t)
-+
-+uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
-+#define vreinterpretq_u64_p16
-+
-+uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
-+#define vreinterpretq_u64_p8
-+
-+int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
-+#define vreinterpret_s8_u32
-+
-+int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
-+#define vreinterpret_s8_u16
-+
-+int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
-+#define vreinterpret_s8_u8
-+
-+int8x8_t vreinterpret_s8_s32 (int32x2_t t);
-+#define vreinterpret_s8_s32
-+
-+int8x8_t vreinterpret_s8_s16 (int16x4_t t);
-+#define vreinterpret_s8_s16
-+
-+int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
-+#define vreinterpret_s8_u64
-+
-+int8x8_t vreinterpret_s8_s64 (int64x1_t t);
-+#define vreinterpret_s8_s64
-+
-+int8x8_t vreinterpret_s8_f32 (float32x2_t t);
-+#define vreinterpret_s8_f32
-+
-+int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
-+#define vreinterpret_s8_p16
-+
-+int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
-+#define vreinterpret_s8_p8
-+
-+int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
-+#define vreinterpretq_s8_u32
-+
-+int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
-+#define vreinterpretq_s8_u16
-+
-+int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
-+#define vreinterpretq_s8_u8
-+
-+int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
-+#define vreinterpretq_s8_s32
-+
-+int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
-+#define vreinterpretq_s8_s16
-+
-+int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
-+#define vreinterpretq_s8_u64
-+
-+int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
-+#define vreinterpretq_s8_s64
-+
-+int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
-+#define vreinterpretq_s8_f32(t) _M128i(t)
-+
-+int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
-+#define vreinterpretq_s8_p16
-+
-+int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
-+#define vreinterpretq_s8_p8
-+
-+int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
-+#define vreinterpret_s16_u32
-+
-+int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
-+#define vreinterpret_s16_u16
-+
-+int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
-+#define vreinterpret_s16_u8
-+
-+int16x4_t vreinterpret_s16_s32 (int32x2_t t);
-+#define vreinterpret_s16_s32
-+
-+int16x4_t vreinterpret_s16_s8 (int8x8_t t);
-+#define vreinterpret_s16_s8
-+
-+int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
-+#define vreinterpret_s16_u64
-+
-+int16x4_t vreinterpret_s16_s64 (int64x1_t t);
-+#define vreinterpret_s16_s64
-+
-+int16x4_t vreinterpret_s16_f32 (float32x2_t t);
-+#define vreinterpret_s16_f32
-+
-+
-+int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
-+#define vreinterpret_s16_p16
-+
-+int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
-+#define vreinterpret_s16_p8
-+
-+int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
-+#define vreinterpretq_s16_u32
-+
-+int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
-+#define vreinterpretq_s16_u16
-+
-+int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
-+#define vreinterpretq_s16_u8
-+
-+int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
-+#define vreinterpretq_s16_s32
-+
-+int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
-+#define vreinterpretq_s16_s8
-+
-+int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
-+#define vreinterpretq_s16_u64
-+
-+int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
-+#define vreinterpretq_s16_s64
-+
-+int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
-+#define vreinterpretq_s16_f32(t) _M128i(t)
-+
-+int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
-+#define vreinterpretq_s16_p16
-+
-+int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
-+#define vreinterpretq_s16_p8
-+
-+int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
-+#define vreinterpret_s32_u32
-+
-+int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
-+#define vreinterpret_s32_u16
-+
-+int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
-+#define vreinterpret_s32_u8
-+
-+int32x2_t vreinterpret_s32_s16 (int16x4_t t);
-+#define vreinterpret_s32_s16
-+
-+int32x2_t vreinterpret_s32_s8 (int8x8_t t);
-+#define vreinterpret_s32_s8
-+
-+int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
-+#define vreinterpret_s32_u64
-+
-+int32x2_t vreinterpret_s32_s64 (int64x1_t t);
-+#define vreinterpret_s32_s64
-+
-+int32x2_t vreinterpret_s32_f32 (float32x2_t t);
-+#define vreinterpret_s32_f32
-+
-+int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
-+#define vreinterpret_s32_p16
-+
-+int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
-+#define vreinterpret_s32_p8
-+
-+int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
-+#define vreinterpretq_s32_u32
-+
-+int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
-+#define vreinterpretq_s32_u16
-+
-+int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
-+#define vreinterpretq_s32_u8
-+
-+int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
-+#define vreinterpretq_s32_s16
-+
-+int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
-+#define vreinterpretq_s32_s8
-+
-+int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
-+#define vreinterpretq_s32_u64
-+
-+int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
-+#define vreinterpretq_s32_s64
-+
-+int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
-+#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
-+
-+int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
-+#define vreinterpretq_s32_p16
-+
-+int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
-+#define vreinterpretq_s32_p8
-+
-+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
-+#define vreinterpret_u8_u32
-+
-+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
-+#define vreinterpret_u8_u16
-+
-+uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
-+#define vreinterpret_u8_s32
-+
-+uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
-+#define vreinterpret_u8_s16
-+
-+uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
-+#define vreinterpret_u8_s8
-+
-+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
-+#define vreinterpret_u8_u64
-+
-+uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
-+#define vreinterpret_u8_s64
-+
-+uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
-+#define vreinterpret_u8_f32
-+
-+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
-+#define vreinterpret_u8_p16
-+
-+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
-+#define vreinterpret_u8_p8
-+
-+uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
-+#define vreinterpretq_u8_u32
-+
-+uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
-+#define vreinterpretq_u8_u16
-+
-+uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
-+#define vreinterpretq_u8_s32
-+
-+uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
-+#define vreinterpretq_u8_s16
-+
-+uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
-+#define vreinterpretq_u8_s8
-+
-+uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
-+#define vreinterpretq_u8_u64
-+
-+uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
-+#define vreinterpretq_u8_s64
-+
-+uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
-+#define vreinterpretq_u8_f32(t) _M128i(t)
-+
-+
-+uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
-+#define vreinterpretq_u8_p16
-+
-+uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
-+#define vreinterpretq_u8_p8
-+
-+uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
-+#define vreinterpret_u16_u32
-+
-+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
-+#define vreinterpret_u16_u8
-+
-+uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
-+#define vreinterpret_u16_s32
-+
-+uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
-+#define vreinterpret_u16_s16
-+
-+uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
-+#define vreinterpret_u16_s8
-+
-+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
-+#define vreinterpret_u16_u64
-+
-+uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
-+#define vreinterpret_u16_s64
-+
-+uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
-+#define vreinterpret_u16_f32
-+
-+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
-+#define vreinterpret_u16_p16
-+
-+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
-+#define vreinterpret_u16_p8
-+
-+uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
-+#define vreinterpretq_u16_u32
-+
-+uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
-+#define vreinterpretq_u16_u8
-+
-+uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
-+#define vreinterpretq_u16_s32
-+
-+uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
-+#define vreinterpretq_u16_s16
-+
-+uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
-+#define vreinterpretq_u16_s8
-+
-+uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
-+#define vreinterpretq_u16_u64
-+
-+uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
-+#define vreinterpretq_u16_s64
-+
-+uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
-+#define vreinterpretq_u16_f32(t) _M128i(t)
-+
-+uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
-+#define vreinterpretq_u16_p16
-+
-+uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
-+#define vreinterpretq_u16_p8
-+
-+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
-+#define vreinterpret_u32_u16
-+
-+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
-+#define vreinterpret_u32_u8
-+
-+uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
-+#define vreinterpret_u32_s32
-+
-+uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
-+#define vreinterpret_u32_s16
-+
-+uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
-+#define vreinterpret_u32_s8
-+
-+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
-+#define vreinterpret_u32_u64
-+
-+uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
-+#define vreinterpret_u32_s64
-+
-+uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
-+#define vreinterpret_u32_f32
-+
-+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
-+#define vreinterpret_u32_p16
-+
-+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
-+#define vreinterpret_u32_p8
-+
-+uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
-+#define vreinterpretq_u32_u16
-+
-+uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
-+#define vreinterpretq_u32_u8
-+
-+uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
-+#define vreinterpretq_u32_s32
-+
-+uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
-+#define vreinterpretq_u32_s16
-+
-+uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
-+#define vreinterpretq_u32_s8
-+
-+uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
-+#define vreinterpretq_u32_u64
-+
-+uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
-+#define vreinterpretq_u32_s64
-+
-+uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
-+#define vreinterpretq_u32_f32(t) _M128i(t)
-+
-+uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
-+#define vreinterpretq_u32_p16
-+
-+uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
-+#define vreinterpretq_u32_p8
-+
-+#endif /* NEON2SSE_H */
-diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
-index fee33a3..22fb2ce 100644
---- a/gcc/config/i386/gnu-user.h
-+++ b/gcc/config/i386/gnu-user.h
-@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see
- When the -shared link option is used a final link is not being
- done. */
-
-+#undef ANDROID_TARGET_CC1_SPEC
-+#define ANDROID_TARGET_CC1_SPEC \
-+ " -mssse3 -fno-short-enums " \
-+
- #undef ASM_SPEC
- #define ASM_SPEC \
-- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
-+ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
-
- #undef SUBTARGET_EXTRA_SPECS
- #define SUBTARGET_EXTRA_SPECS \
-diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
-index 7a02a7e..cac4179 100644
---- a/gcc/config/i386/gnu-user64.h
-+++ b/gcc/config/i386/gnu-user64.h
-@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define SPEC_X32 "mx32"
- #endif
-
-+#undef ANDROID_TARGET_CC1_SPEC
-+#define ANDROID_TARGET_CC1_SPEC \
-+ "%{m32:-mssse3 -fno-short-enums}" \
-+ "%{!m32:-msse4.2 -mpopcnt}"
-+
- #undef ASM_SPEC
- #define ASM_SPEC "%{" SPEC_32 ":--32} \
- %{" SPEC_64 ":--64} \
-diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
-index 3d044e8..5c89fca 100644
---- a/gcc/config/i386/i386.c
-+++ b/gcc/config/i386/i386.c
-@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp)
- else if (!SYMBOL_REF_FAR_ADDR_P (op0)
- && (SYMBOL_REF_LOCAL_P (op0)
- || (HAVE_LD_PIE_COPYRELOC
-+ && !TARGET_HAS_BIONIC
- && flag_pie
- && !SYMBOL_REF_WEAK (op0)
- && !SYMBOL_REF_FUNCTION_P (op0)))
-diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
-index 4b9910f..c33e074 100644
---- a/gcc/config/i386/linux-common.h
-+++ b/gcc/config/i386/linux-common.h
-@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see
- #undef CC1_SPEC
- #define CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
-+ GNU_USER_TARGET_CC1_SPEC \
-+ ANDROID_TARGET_CC1_SPEC \
-+ " " \
-+ ANDROID_CC1_SPEC("-fPIC"))
-+
-+#define CC1PLUS_SPEC \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
-
- #undef LINK_SPEC
- #define LINK_SPEC \
-diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h
-index a1f98d3..3725799 100644
---- a/gcc/config/i386/pmm_malloc.h
-+++ b/gcc/config/i386/pmm_malloc.h
-@@ -31,7 +31,7 @@
- #ifndef __cplusplus
- extern int posix_memalign (void **, size_t, size_t);
- #else
--extern "C" int posix_memalign (void **, size_t, size_t) throw ();
-+extern "C" int posix_memalign (void **, size_t, size_t);
- #endif
-
- static __inline void *
-diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h
-index 301a41c..bebfe7f 100644
---- a/gcc/config/linux-android.h
-+++ b/gcc/config/linux-android.h
-@@ -38,15 +38,18 @@
- "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}"
-
- #define ANDROID_LINK_SPEC \
-- "%{shared: -Bsymbolic}"
-+ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now"
-
--#define ANDROID_CC1_SPEC \
-+#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
- "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
-- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}"
-+ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"
-
- #define ANDROID_CC1PLUS_SPEC \
-- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \
-- "%{!frtti:%{!fno-rtti: -fno-rtti}}"
-+ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \
-+ "%{!frtti:%{!fno-rtti: -frtti}}"
-+
-+#define ANDROID_ASM_SPEC \
-+ "--noexecstack"
-
- #define ANDROID_LIB_SPEC \
- "%{!static: -ldl}"
-diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
-new file mode 100644
-index 0000000..32c539c
---- /dev/null
-+++ b/gcc/config/mips/android.h
-@@ -0,0 +1,49 @@
-+/* Target macros for mips*-*android* targets.
-+ Copyright (C) 2014 Free Software Foundation, Inc.
-+
-+This file is part of GCC.
-+
-+GCC is free software; you can redistribute it and/or modify
-+it under the terms of the GNU General Public License as published by
-+the Free Software Foundation; either version 3, or (at your option)
-+any later version.
-+
-+GCC is distributed in the hope that it will be useful,
-+but WITHOUT ANY WARRANTY; without even the implied warranty of
-+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+GNU General Public License for more details.
-+
-+You should have received a copy of the GNU General Public License
-+along with GCC; see the file COPYING3. If not see
-+<http://www.gnu.org/licenses/>. */
-+
-+#undef DRIVER_SELF_SPECS
-+#define DRIVER_SELF_SPECS \
-+ /* Make sure a -mips option is present. This helps us to pick \
-+ the right multilib, and also makes the later specs easier \
-+ to write. */ \
-+ MIPS_ISA_LEVEL_SPEC, \
-+ \
-+ /* Infer the default float setting from -march. */ \
-+ MIPS_ARCH_FLOAT_SPEC, \
-+ \
-+ /* Infer the -msynci setting from -march if not explicitly set. */ \
-+ MIPS_ISA_SYNCI_SPEC, \
-+ \
-+ /* If no ABI option is specified, infer one from the ISA level \
-+ or -mgp setting. */ \
-+ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
-+ \
-+ /* If no FP ABI option is specified, infer one from the \
-+ ABI/ISA level unless there is a conflicting option. */ \
-+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
-+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
-+ \
-+ /* If no odd-spreg option is specified, infer one from the ISA. */ \
-+ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
-+ \
-+ /* Base SPECs. */ \
-+ BASE_DRIVER_SELF_SPECS, \
-+ \
-+ /* Use the standard linux specs for everything else. */ \
-+ LINUX_DRIVER_SELF_SPECS
-diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
-index 15b549c..4a28160 100644
---- a/gcc/config/mips/gnu-user.h
-+++ b/gcc/config/mips/gnu-user.h
-@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
- /* The GNU C++ standard library requires this. */ \
- if (c_dialect_cxx ()) \
- builtin_define ("_GNU_SOURCE"); \
-+ ANDROID_TARGET_OS_CPP_BUILTINS(); \
- } while (0)
-
- #undef SUBTARGET_CPP_SPEC
-@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see
-
- #undef SUBTARGET_ASM_SPEC
- #define SUBTARGET_ASM_SPEC \
-- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}"
-+ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
-+ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
-
- /* The MIPS assembler has different syntax for .set. We set it to
- .dummy to trap any errors. */
-@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
- #endif
-
- #define LINUX_DRIVER_SELF_SPECS \
-- NO_SHARED_SPECS \
-+ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \
- MARCH_MTUNE_NATIVE_SPECS, \
- /* -mplt has no effect without -mno-shared. Simplify later \
- specs handling by removing a redundant option. */ \
-diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
-index 8429a7c..4b0825d 100644
---- a/gcc/config/mips/linux-common.h
-+++ b/gcc/config/mips/linux-common.h
-@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see
- #undef SUBTARGET_CC1_SPEC
- #define SUBTARGET_CC1_SPEC \
- LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
-- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
-+ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
-
- #undef CC1PLUS_SPEC
- #define CC1PLUS_SPEC \
-@@ -62,3 +62,7 @@ along with GCC; see the file COPYING3. If not see
-
- /* The default value isn't sufficient in 64-bit mode. */
- #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024)
-+
-+#ifdef IN_LIBGCC2
-+#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
-+#endif
-diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
-new file mode 100644
-index 0000000..39f512c
---- /dev/null
-+++ b/gcc/config/mips/t-linux-android
-@@ -0,0 +1,3 @@
-+MULTILIB_OPTIONS = mips32r2/mips32r6
-+MULTILIB_DIRNAMES = mips-r2 mips-r6
-+MULTILIB_OSDIRNAMES = ../libr2 ../libr6
-diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
-new file mode 100644
-index 0000000..55cab7d
---- /dev/null
-+++ b/gcc/config/mips/t-linux-android64
-@@ -0,0 +1,4 @@
-+MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
-+MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
-+MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
-+MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
-diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h
-index 37ecfc4..a5f1b99 100644
---- a/gcc/config/openbsd.h
-+++ b/gcc/config/openbsd.h
-@@ -136,8 +136,12 @@ while (0)
- #define LIB_SPEC OBSD_LIB_SPEC
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
- #endif
-+#endif
-
- #undef LIB_SPEC
- #define LIB_SPEC OBSD_LIB_SPEC
-diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
-index cbf9097..eb2217f 100644
---- a/gcc/config/rs6000/sysv4.h
-+++ b/gcc/config/rs6000/sysv4.h
-@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
- -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}"
-
- #if defined(HAVE_LD_EH_FRAME_HDR)
--# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+# ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+# define LINK_EH_SPEC "--eh-frame-hdr "
-+# else
-+# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+# endif
- #endif
-
- #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \
-diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
-index 5160e1f..7632a50 100644
---- a/gcc/config/sol2.h
-+++ b/gcc/config/sol2.h
-@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see
- /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs
- --eh-frame-hdr to create the required .eh_frame_hdr sections. */
- #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR)
-+#ifdef USE_EH_FRAME_HDR_FOR_STATIC
-+#define LINK_EH_SPEC "--eh-frame-hdr "
-+#else
- #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
-+#endif
- #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
- #endif
-
-diff --git a/gcc/configure b/gcc/configure
-index 1c6e340..28ad050 100755
---- a/gcc/configure
-+++ b/gcc/configure
-@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769
- enable_fix_cortex_a53_843419
- with_glibc_version
- enable_gnu_unique_object
-+enable_eh_frame_hdr_for_static
- enable_linker_build_id
- enable_default_ssp
- with_long_double_128
-@@ -1670,6 +1671,9 @@ Optional Features:
- --enable-gnu-unique-object
- enable the use of the @gnu_unique_object ELF
- extension on glibc systems
-+ --enable-eh-frame-hdr-for-static
-+ enable linker PT_GNU_EH_FRAME support for static
-+ executable
- --enable-linker-build-id
- compiler will always pass --build-id to linker
- --enable-default-ssp enable Stack Smashing Protection as default
-@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
-
- $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h
-
-+ # Check whether --enable-eh-frame-hdr-for-static was given.
-+if test "${enable_eh_frame_hdr_for_static+set}" = set; then :
-+ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in
-+ yes | no) ;;
-+ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid
-+value for --enable-eh-frame-hdr-for-static.
-+Valid choices are 'yes' and 'no'." "$LINENO" 5 ;;
-+ esac
-+else
-+ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
-+# Linux kernel.
-+ if test x$host = x$build -a x$host = x$target &&
-+ ldd --version 2>&1 >/dev/null &&
-+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
-+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
-+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
-+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
-+ if test "$glibcnum" -ge 2003 ; then
-+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
-+ if echo "$auvx" | grep AT_PHDR > /dev/null &&
-+ echo "$auvx" | grep AT_PHNUM > /dev/null; then
-+ enable_eh_frame_hdr_for_static=yes
-+ fi
-+ fi
-+ fi
-+fi
-+
-+ if test x$enable_eh_frame_hdr_for_static = xyes; then
-+
-+$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h
-+
-+ fi
- fi
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5
- $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; }
-diff --git a/gcc/configure.ac b/gcc/configure.ac
-index 6c1dcd9..0cf7419 100644
---- a/gcc/configure.ac
-+++ b/gcc/configure.ac
-@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR])
- if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
- AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1,
- [Define if your linker supports .eh_frame_hdr.])
-+ AC_ARG_ENABLE(eh-frame-hdr-for-static,
-+ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static],
-+ [enable linker PT_GNU_EH_FRAME support for static executable])],
-+ [case $enable_eh_frame_hdr_for_static in
-+ yes | no) ;;
-+ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid
-+value for --enable-eh-frame-hdr-for-static.
-+Valid choices are 'yes' and 'no'.]) ;;
-+ esac],
-+# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
-+# Linux kernel.
-+ [[if test x$host = x$build -a x$host = x$target &&
-+ ldd --version 2>&1 >/dev/null &&
-+ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
-+ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
-+ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
-+ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
-+ if test "$glibcnum" -ge 2003 ; then
-+ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
-+ if echo "$auvx" | grep AT_PHDR > /dev/null &&
-+ echo "$auvx" | grep AT_PHNUM > /dev/null; then
-+ enable_eh_frame_hdr_for_static=yes
-+ fi
-+ fi
-+ fi]])
-+ if test x$enable_eh_frame_hdr_for_static = xyes; then
-+ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1,
-+[Define if your system supports PT_GNU_EH_FRAME for static executable.])
-+ fi
- fi
- AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr)
-
-diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C
-new file mode 100644
-index 0000000..15408ef
---- /dev/null
-+++ b/gcc/testsuite/g++.dg/eh/spec3-static.C
-@@ -0,0 +1,25 @@
-+// PR c++/4381
-+// Test that exception-specs work properly for classes with virtual bases.
-+
-+// { dg-do run }
-+// { dg-options "-static" }
-+
-+class Base {};
-+
-+struct A : virtual public Base
-+{
-+ A() {}
-+};
-+
-+struct B {};
-+
-+void func() throw (B,A)
-+{
-+ throw A();
-+}
-+
-+int main(void)
-+{
-+ try { func(); }
-+ catch (A& a) { }
-+}
-diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
-index f3343fc..d426477 100644
---- a/libgcc/crtstuff.c
-+++ b/libgcc/crtstuff.c
-@@ -88,7 +88,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(BSD_DL_ITERATE_PHDR_AVAILABLE)
- #include <link.h>
- # define USE_PT_GNU_EH_FRAME
-@@ -97,7 +98,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(__sun__) && defined(__svr4__)
- #include <link.h>
- # define USE_PT_GNU_EH_FRAME
-@@ -106,7 +108,8 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
-+ && !defined(inhibit_libc) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(__GLIBC__) && __GLIBC__ >= 2
- #include <link.h>
- /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
-@@ -121,7 +124,7 @@ call_ ## FUNC (void) \
- #if defined(OBJECT_FORMAT_ELF) \
- && !defined(OBJECT_FORMAT_FLAT) \
- && defined(HAVE_LD_EH_FRAME_HDR) \
-- && !defined(CRTSTUFFT_O) \
-+ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
- && defined(inhibit_libc) \
- && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__))
- /* On systems using glibc, an inhibit_libc build of libgcc is only
-diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
-index 555c0fe..47c8655 100644
---- a/libgcc/gthr-posix.h
-+++ b/libgcc/gthr-posix.h
-@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define __GTHREADS 1
- #define __GTHREADS_CXX0X 1
-
-+/* The following should normally be in a different header file,
-+ * but I couldn't find the right location. The point of the macro
-+ * definition below is to prevent libsupc++ and libstdc++ to reference
-+ * weak symbols in their static C++ constructors. Such code crashes
-+ * when a shared object linked statically to these libraries is
-+ * loaded on Android 2.1 (Eclair) and older platform releases, due
-+ * to a dynamic linker bug.
-+ */
-+#ifdef __ANDROID__
-+#undef GTHREAD_USE_WEAK
-+#define GTHREAD_USE_WEAK 0
-+#endif
-+
- #include <pthread.h>
-
- #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \
-diff --git a/libgcc/gthr.h b/libgcc/gthr.h
-index 47a7d06..67a680f 100644
---- a/libgcc/gthr.h
-+++ b/libgcc/gthr.h
-@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- #define GTHREAD_USE_WEAK 1
- #endif
- #endif
-+#if __ANDROID__
-+#include "gthr-posix.h"
-+#else
- #include "gthr-default.h"
-+#endif
-
- #ifndef HIDE_EXPORTS
- #pragma GCC visibility pop
-diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
-index 41797a9..5a631dd 100755
---- a/libstdc++-v3/configure
-+++ b/libstdc++-v3/configure
-@@ -78319,6 +78319,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
- /* end confdefs.h. */
- #include <sys/syscall.h>
- int lk;
-+#if !defined(SYS_gettid)
-+#define SYS_gettid __NR_gettid
-+#endif
-+#if !defined(SYS_futex)
-+#define SYS_futex __NR_futex
-+#endif
- int
- main ()
- {
-@@ -78377,6 +78383,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
- /* end confdefs.h. */
- #include <sys/syscall.h>
- int lk;
-+#if !defined(SYS_gettid)
-+#define SYS_gettid __NR_gettid
-+#endif
-+#if !defined(SYS_futex)
-+#define SYS_futex __NR_futex
-+#endif
- int
- main ()
- {
-diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h
-index e3e206b..e85dc2c 100644
---- a/libstdc++-v3/include/bits/locale_facets.h
-+++ b/libstdc++-v3/include/bits/locale_facets.h
-@@ -47,6 +47,20 @@
- #include <ext/numeric_traits.h>
- #include <bits/streambuf_iterator.h>
-
-+#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__
-+// CrystaX: for some reason, x86 gcc-4.9 makes ctype<char>::do_widen() and
-+// ctype<char>::_M_widen_init() methods working wrong if optimization enabled.
-+// For ctype<char>::do_widen(), values of passed arguments (__lo, __hi and __to)
-+// are completely messed up and don't correspond to passed values. In case if
-+// we disable optimization for those methods, things become correct so we apply
-+// this workaround here for a time.
-+// TODO: figure out what exactly wrong here - is it bug in GCC optimization
-+// algorithm or smth else?
-+#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0)))
-+#else
-+#define __CRYSTAX_X86_DONT_OPTIMIZE
-+#endif
-+
- namespace std _GLIBCXX_VISIBILITY(default)
- {
- _GLIBCXX_BEGIN_NAMESPACE_VERSION
-@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
- * @return @a __hi.
- */
- virtual const char*
-- do_widen(const char* __lo, const char* __hi, char_type* __to) const
-+ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE
- {
- __builtin_memcpy(__to, __lo, __hi - __lo);
- return __hi;
-@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
-
- private:
- void _M_narrow_init() const;
-- void _M_widen_init() const;
-+ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE;
- };
-
- #ifdef _GLIBCXX_USE_WCHAR_T
-diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc
-index 9b61799..c149169 100644
---- a/libstdc++-v3/libsupc++/guard.cc
-+++ b/libstdc++-v3/libsupc++/guard.cc
-@@ -33,7 +33,12 @@
- #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
- && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX)
- # include <climits>
-+#if defined(__ANDROID__)
-+# include <sys/syscall.h>
-+# define SYS_futex __NR_futex
-+#else
- # include <syscall.h>
-+#endif
- # include <unistd.h>
- # define _GLIBCXX_USE_FUTEX
- # define _GLIBCXX_FUTEX_WAIT 0
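[Illustration, not part of the patch: the guard.cc hunk above (repeated in 970-crystax.patch below) exists because Android's <sys/syscall.h> has not always provided the SYS_futex alias, only the raw __NR_futex number. A minimal sketch of the same fallback pattern outside libstdc++; the helper name is made up:]

    #include <unistd.h>
    #include <sys/syscall.h>
    #ifndef SYS_futex
    # define SYS_futex __NR_futex   /* Bionic may only spell it __NR_futex */
    #endif

    /* Wake one waiter blocked on *addr; 1 is FUTEX_WAKE in <linux/futex.h>. */
    static inline long futex_wake_one(int *addr)
    {
      return syscall(SYS_futex, addr, 1, 1, 0, 0, 0);
    }
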
diff --git a/patches/gcc/linaro-6.3-2017.02/970-crystax.patch b/patches/gcc/linaro-6.3-2017.02/970-crystax.patch
new file mode 100644
index 0000000..e3109cc
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/970-crystax.patch
@@ -0,0 +1,551 @@
+commit 080803512c8f6f87c2f1f711170d54033144d628
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Wed Jul 29 11:28:29 2015 +0300
+
+ [android] Apply Android-related modifications
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+[Edited: keep libstdc++, drop libcrystax-related modifications]
+diff --git a/gcc/config.gcc b/gcc/config.gcc
+index f66e48cd1..1c253496b 100644
+--- a/gcc/config.gcc
++++ b/gcc/config.gcc
+@@ -942,13 +942,17 @@ aarch64*-*-elf | aarch64*-*-rtems*)
+ TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
+ ;;
+ aarch64*-*-linux*)
+- tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h"
++ tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h"
+ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h"
++ extra_options="${extra_options} linux-android.opt"
+ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux"
+ case $target in
+ aarch64_be-*)
+ tm_defines="${tm_defines} TARGET_BIG_ENDIAN_DEFAULT=1"
+ ;;
++ aarch64*-*-linux-android*)
++ tm_file="${tm_file} aarch64/aarch64-linux-android.h"
++ ;;
+ esac
+ aarch64_multilibs="${with_multilib_list}"
+ if test "$aarch64_multilibs" = "default"; then
+@@ -2055,6 +2059,17 @@ mips*-*-linux*) # Linux MIPS, either endian.
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
+ extra_options="${extra_options} linux-android.opt"
+ case ${target} in
++ mips64*android*)
++ default_mips_arch=mips64r6
++ default_mips_abi=64
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="${tmake_file} mips/t-linux-android64"
++ ;;
++ mips*android*)
++ default_mips_arch=mips32
++ tm_file="${tm_file} mips/android.h"
++ tmake_file="$tmake_file mips/t-linux-android"
++ ;;
+ mipsisa32r6*)
+ default_mips_arch=mips32r6
+ ;;
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+new file mode 100644
+index 000000000..db1288fd0
+--- /dev/null
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -0,0 +1,59 @@
++/* Machine description for AArch64 architecture.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ GCC is distributed in the hope that it will be useful, but
++ WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING3. If not see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GCC_AARCH64_LINUX_ANDROID_H
++#define GCC_AARCH64_LINUX_ANDROID_H
++
++
++#undef TARGET_OS_CPP_BUILTINS
++#define TARGET_OS_CPP_BUILTINS() \
++ do \
++ { \
++ GNU_USER_TARGET_OS_CPP_BUILTINS(); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
++ } \
++ while (0)
++
++#undef LINK_SPEC
++#define LINK_SPEC \
++ LINUX_OR_ANDROID_LD (LINUX_TARGET_LINK_SPEC, \
++ LINUX_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
++
++#undef CC1_SPEC
++#define CC1_SPEC \
++ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
++
++#undef LIB_SPEC
++#define LIB_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
++ GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
++
++#undef STARTFILE_SPEC
++#define STARTFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC)
++
++#undef ENDFILE_SPEC
++#define ENDFILE_SPEC \
++ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
++
++#endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
+index 5fcaa59a3..6864195ee 100644
+--- a/gcc/config/aarch64/aarch64-linux.h
++++ b/gcc/config/aarch64/aarch64-linux.h
+@@ -21,7 +21,14 @@
+ #ifndef GCC_AARCH64_LINUX_H
+ #define GCC_AARCH64_LINUX_H
+
+-#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifndef RUNTIME_ROOT_PREFIX
++#define RUNTIME_ROOT_PREFIX ""
++#endif
++#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
++#ifdef BIONIC_DYNAMIC_LINKER
++#undef BIONIC_DYNAMIC_LINKER
++#endif
++#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
+
+ #undef MUSL_DYNAMIC_LINKER
+ #define MUSL_DYNAMIC_LINKER "/lib/ld-musl-aarch64%{mbig-endian:_be}%{mabi=ilp32:_ilp32}.so.1"
+diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
+index ad123dde9..97b059de6 100644
+--- a/gcc/config/arm/arm.h
++++ b/gcc/config/arm/arm.h
+@@ -1888,10 +1888,11 @@ enum arm_auto_incmodes
+
+ #define CASE_VECTOR_PC_RELATIVE (TARGET_THUMB2 \
+ || (TARGET_THUMB1 \
++ && !inline_thumb1_jump_table \
+ && (optimize_size || flag_pic)))
+
+ #define CASE_VECTOR_SHORTEN_MODE(min, max, body) \
+- (TARGET_THUMB1 \
++ (TARGET_THUMB1 && !inline_thumb1_jump_table \
+ ? (min >= 0 && max < 512 \
+ ? (ADDR_DIFF_VEC_FLAGS (body).offset_unsigned = 1, QImode) \
+ : min >= -256 && max < 256 \
+diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
+index 47171b996..eb22d1181 100644
+--- a/gcc/config/arm/arm.md
++++ b/gcc/config/arm/arm.md
+@@ -8179,7 +8179,7 @@
+ (match_operand:SI 2 "const_int_operand" "") ; total range
+ (match_operand:SI 3 "" "") ; table label
+ (match_operand:SI 4 "" "")] ; Out of range label
+- "TARGET_32BIT || optimize_size || flag_pic"
++ "TARGET_32BIT || ((optimize_size || flag_pic) && !inline_thumb1_jump_table)"
+ "
+ {
+ enum insn_code code;
+diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
+index 0ebe01743..772453889 100644
+--- a/gcc/config/arm/arm.opt
++++ b/gcc/config/arm/arm.opt
+@@ -193,6 +193,10 @@ mthumb-interwork
+ Target Report Mask(INTERWORK)
+ Support calls between Thumb and ARM instruction sets.
+
++minline-thumb1-jumptable
++Target Report Var(inline_thumb1_jump_table)
++Inline Thumb1 Jump table code
++
+ mtls-dialect=
+ Target RejectNegative Joined Enum(tls_type) Var(target_tls_dialect) Init(TLS_GNU)
+ Specify thread local storage scheme.
+diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h
+index 77f30554d..32158ed65 100644
+--- a/gcc/config/arm/elf.h
++++ b/gcc/config/arm/elf.h
+@@ -56,8 +56,7 @@
+ #undef SUBSUBTARGET_EXTRA_SPECS
+ #define SUBSUBTARGET_EXTRA_SPECS
+
+-#ifndef ASM_SPEC
+-#define ASM_SPEC "\
++#define DEFAULT_ASM_SPEC "\
+ %{mbig-endian:-EB} \
+ %{mlittle-endian:-EL} \
+ %(asm_cpu_spec) \
+@@ -66,6 +65,9 @@
+ %{mthumb-interwork:-mthumb-interwork} \
+ %{mfloat-abi=*} %{mfpu=*} \
+ %(subtarget_extra_asm_spec)"
++
++#ifndef ASM_SPEC
++#define ASM_SPEC DEFAULT_ASM_SPEC
+ #endif
+
+ /* The ARM uses @ are a comment character so we need to redefine
+@@ -104,7 +106,8 @@
+ the code more efficient, but for Thumb-1 it's better to put them out of
+ band unless we are generating compressed tables. */
+ #define JUMP_TABLES_IN_TEXT_SECTION \
+- (TARGET_32BIT || (TARGET_THUMB && (optimize_size || flag_pic)))
++ (TARGET_32BIT || (TARGET_THUMB && !inline_thumb1_jump_table \
++ && (optimize_size || flag_pic)))
+
+ #ifndef LINK_SPEC
+ #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X"
+diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h
+index ace84816e..8c8fa6553 100644
+--- a/gcc/config/arm/linux-eabi.h
++++ b/gcc/config/arm/linux-eabi.h
+@@ -108,11 +108,16 @@
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC, \
+ GNU_USER_TARGET_CC1_SPEC " " ASAN_CC1_SPEC " " \
+- ANDROID_CC1_SPEC)
++ ANDROID_CC1_SPEC("-fpic"))
+
+ #define CC1PLUS_SPEC \
+ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
++#undef ASM_SPEC
++#define ASM_SPEC \
++ LINUX_OR_ANDROID_CC (DEFAULT_ASM_SPEC, \
++ DEFAULT_ASM_SPEC " " ANDROID_ASM_SPEC)
++
+ #undef LIB_SPEC
+ #define LIB_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
+diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
+index fee33a3ef..22fb2ced9 100644
+--- a/gcc/config/i386/gnu-user.h
++++ b/gcc/config/i386/gnu-user.h
+@@ -65,9 +65,14 @@ along with GCC; see the file COPYING3. If not see
+ When the -shared link option is used a final link is not being
+ done. */
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ " -mssse3 -fno-short-enums " \
++
+ #undef ASM_SPEC
+ #define ASM_SPEC \
+- "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
++ "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ #undef SUBTARGET_EXTRA_SPECS
+ #define SUBTARGET_EXTRA_SPECS \
+diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
+index 7a02a7eb4..cac4179bc 100644
+--- a/gcc/config/i386/gnu-user64.h
++++ b/gcc/config/i386/gnu-user64.h
+@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define SPEC_X32 "mx32"
+ #endif
+
++#undef ANDROID_TARGET_CC1_SPEC
++#define ANDROID_TARGET_CC1_SPEC \
++ "%{m32:-mssse3 -fno-short-enums}" \
++ "%{!m32:-msse4.2 -mpopcnt}"
++
+ #undef ASM_SPEC
+ #define ASM_SPEC "%{" SPEC_32 ":--32} \
+ %{" SPEC_64 ":--64} \
+diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
+index 4b9910fa9..3b11ed086 100644
+--- a/gcc/config/i386/linux-common.h
++++ b/gcc/config/i386/linux-common.h
+@@ -30,7 +30,13 @@ along with GCC; see the file COPYING3. If not see
+ #undef CC1_SPEC
+ #define CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC \
++ ANDROID_TARGET_CC1_SPEC \
++ " " \
++ ANDROID_CC1_SPEC("-fPIC"))
++
++#define CC1PLUS_SPEC \
++ LINUX_OR_ANDROID_CC ("", ANDROID_CC1PLUS_SPEC)
+
+ #undef LINK_SPEC
+ #define LINK_SPEC \
+diff --git a/gcc/config/linux-android.h b/gcc/config/linux-android.h
+index 301a41ccd..9623c88d0 100644
+--- a/gcc/config/linux-android.h
++++ b/gcc/config/linux-android.h
+@@ -38,15 +39,18 @@
+ "%{" NOANDROID "|tno-android-ld:" LINUX_SPEC ";:" ANDROID_SPEC "}"
+
+ #define ANDROID_LINK_SPEC \
+- "%{shared: -Bsymbolic}"
++ "%{shared: -Bsymbolic} -z noexecstack -z relro -z now"
+
+-#define ANDROID_CC1_SPEC \
++#define ANDROID_CC1_SPEC(ANDROID_PIC_DEFAULT) \
+ "%{!mglibc:%{!muclibc:%{!mbionic: -mbionic}}} " \
+- "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}"
++ "%{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: " ANDROID_PIC_DEFAULT "}}}}"
+
+ #define ANDROID_CC1PLUS_SPEC \
+- "%{!fexceptions:%{!fno-exceptions: -fno-exceptions}} " \
+- "%{!frtti:%{!fno-rtti: -fno-rtti}}"
++ "%{!fexceptions:%{!fno-exceptions: -fexceptions}} " \
++ "%{!frtti:%{!fno-rtti: -frtti}}"
++
++#define ANDROID_ASM_SPEC \
++ "--noexecstack"
+
+ #define ANDROID_LIB_SPEC \
+ "%{!static: -ldl}"
+diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
+new file mode 100644
+index 000000000..32c539c8d
+--- /dev/null
++++ b/gcc/config/mips/android.h
+@@ -0,0 +1,49 @@
++/* Target macros for mips*-*android* targets.
++ Copyright (C) 2014 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; either version 3, or (at your option)
++any later version.
++
++GCC is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#undef DRIVER_SELF_SPECS
++#define DRIVER_SELF_SPECS \
++ /* Make sure a -mips option is present. This helps us to pick \
++ the right multilib, and also makes the later specs easier \
++ to write. */ \
++ MIPS_ISA_LEVEL_SPEC, \
++ \
++ /* Infer the default float setting from -march. */ \
++ MIPS_ARCH_FLOAT_SPEC, \
++ \
++ /* Infer the -msynci setting from -march if not explicitly set. */ \
++ MIPS_ISA_SYNCI_SPEC, \
++ \
++ /* If no ABI option is specified, infer one from the ISA level \
++ or -mgp setting. */ \
++ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
++ \
++ /* If no FP ABI option is specified, infer one from the \
++ ABI/ISA level unless there is a conflicting option. */ \
++ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
++ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
++ \
++ /* If no odd-spreg option is specified, infer one from the ISA. */ \
++ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
++ \
++ /* Base SPECs. */ \
++ BASE_DRIVER_SELF_SPECS, \
++ \
++ /* Use the standard linux specs for everything else. */ \
++ LINUX_DRIVER_SELF_SPECS
+diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
+index 15b549c08..4a2816014 100644
+--- a/gcc/config/mips/gnu-user.h
++++ b/gcc/config/mips/gnu-user.h
+@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3. If not see
+ /* The GNU C++ standard library requires this. */ \
+ if (c_dialect_cxx ()) \
+ builtin_define ("_GNU_SOURCE"); \
++ ANDROID_TARGET_OS_CPP_BUILTINS(); \
+ } while (0)
+
+ #undef SUBTARGET_CPP_SPEC
+@@ -71,7 +72,8 @@ along with GCC; see the file COPYING3. If not see
+
+ #undef SUBTARGET_ASM_SPEC
+ #define SUBTARGET_ASM_SPEC \
+- "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}}"
++ "%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
++ LINUX_OR_ANDROID_CC ("", ANDROID_ASM_SPEC)
+
+ /* The MIPS assembler has different syntax for .set. We set it to
+ .dummy to trap any errors. */
+@@ -120,7 +122,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
+ #endif
+
+ #define LINUX_DRIVER_SELF_SPECS \
+- NO_SHARED_SPECS \
++ LINUX_OR_ANDROID_CC(NO_SHARED_SPECS, "") \
+ MARCH_MTUNE_NATIVE_SPECS, \
+ /* -mplt has no effect without -mno-shared. Simplify later \
+ specs handling by removing a redundant option. */ \
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8429a7ca2..8bfacf994 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see
+ #undef SUBTARGET_CC1_SPEC
+ #define SUBTARGET_CC1_SPEC \
+ LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+- GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
++ GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC("-fpic"))
+
+ #undef CC1PLUS_SPEC
+ #define CC1PLUS_SPEC \
+diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
+new file mode 100644
+index 000000000..39f512c81
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android
+@@ -0,0 +1,3 @@
++MULTILIB_OPTIONS = mips32r2/mips32r6
++MULTILIB_DIRNAMES = mips-r2 mips-r6
++MULTILIB_OSDIRNAMES = ../libr2 ../libr6
+diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
+new file mode 100644
+index 000000000..55cab7d62
+--- /dev/null
++++ b/gcc/config/mips/t-linux-android64
+@@ -0,0 +1,4 @@
++MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
++MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
++MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
++MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
+diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
+index 555c0fe24..47c8655f9 100644
+--- a/libgcc/gthr-posix.h
++++ b/libgcc/gthr-posix.h
+@@ -32,6 +32,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define __GTHREADS 1
+ #define __GTHREADS_CXX0X 1
+
++/* The following should normally be in a different header file,
++ * but I couldn't find the right location. The point of the macro
++ * definition below is to prevent libsupc++ and libstdc++ to reference
++ * weak symbols in their static C++ constructors. Such code crashes
++ * when a shared object linked statically to these libraries is
++ * loaded on Android 2.1 (Eclair) and older platform releases, due
++ * to a dynamic linker bug.
++ */
++#ifdef __ANDROID__
++#undef GTHREAD_USE_WEAK
++#define GTHREAD_USE_WEAK 0
++#endif
++
+ #include <pthread.h>
+
+ #if ((defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)) \
+diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
+index 41797a971..f746e8353 100755
+--- a/libstdc++-v3/configure
++++ b/libstdc++-v3/configure
+@@ -78319,6 +78341,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+@@ -78377,6 +78405,12 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ /* end confdefs.h. */
+ #include <sys/syscall.h>
+ int lk;
++#if !defined(SYS_gettid)
++#define SYS_gettid __NR_gettid
++#endif
++#if !defined(SYS_futex)
++#define SYS_futex __NR_futex
++#endif
+ int
+ main ()
+ {
+diff --git a/libstdc++-v3/include/bits/locale_facets.h b/libstdc++-v3/include/bits/locale_facets.h
+index e3e206b7d..e85dc2c76 100644
+--- a/libstdc++-v3/include/bits/locale_facets.h
++++ b/libstdc++-v3/include/bits/locale_facets.h
+@@ -47,6 +47,20 @@
+ #include <ext/numeric_traits.h>
+ #include <bits/streambuf_iterator.h>
+
++#if !__clang__ && __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __i386__
++// CrystaX: for some reason, x86 gcc-4.9 makes ctype<char>::do_widen() and
++// ctype<char>::_M_widen_init() methods working wrong if optimization enabled.
++// For ctype<char>::do_widen(), values of passed arguments (__lo, __hi and __to)
++// are completely messed up and don't correspond to passed values. In case if
++// we disable optimization for those methods, things become correct so we apply
++// this workaround here for a time.
++// TODO: figure out what exactly wrong here - is it bug in GCC optimization
++// algorithm or smth else?
++#define __CRYSTAX_X86_DONT_OPTIMIZE __attribute__((optimize(0)))
++#else
++#define __CRYSTAX_X86_DONT_OPTIMIZE
++#endif
++
+ namespace std _GLIBCXX_VISIBILITY(default)
+ {
+ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+@@ -1102,7 +1116,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+ * @return @a __hi.
+ */
+ virtual const char*
+- do_widen(const char* __lo, const char* __hi, char_type* __to) const
++ do_widen(const char* __lo, const char* __hi, char_type* __to) const __CRYSTAX_X86_DONT_OPTIMIZE
+ {
+ __builtin_memcpy(__to, __lo, __hi - __lo);
+ return __hi;
+@@ -1163,7 +1177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
+
+ private:
+ void _M_narrow_init() const;
+- void _M_widen_init() const;
++ void _M_widen_init() const __CRYSTAX_X86_DONT_OPTIMIZE;
+ };
+
+ #ifdef _GLIBCXX_USE_WCHAR_T
+diff --git a/libstdc++-v3/libsupc++/guard.cc b/libstdc++-v3/libsupc++/guard.cc
+index 9b617998f..c149169bb 100644
+--- a/libstdc++-v3/libsupc++/guard.cc
++++ b/libstdc++-v3/libsupc++/guard.cc
+@@ -33,7 +33,12 @@
+ #if defined(__GTHREADS) && defined(__GTHREAD_HAS_COND) \
+ && (ATOMIC_INT_LOCK_FREE > 1) && defined(_GLIBCXX_HAVE_LINUX_FUTEX)
+ # include <climits>
++#if defined(__ANDROID__)
++# include <sys/syscall.h>
++# define SYS_futex __NR_futex
++#else
+ # include <syscall.h>
++#endif
+ # include <unistd.h>
+ # define _GLIBCXX_USE_FUTEX
+ # define _GLIBCXX_FUTEX_WAIT 0
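[Illustration, not part of the patch: the ANDROID_CC1PLUS_SPEC change above flips the Android C++ defaults from -fno-exceptions/-fno-rtti to -fexceptions/-frtti. A sketch of code that then builds without extra flags — it needs RTTI for the dynamic_cast and exception support for the throw/catch:]

    #include <stdexcept>

    struct Base { virtual ~Base() {} };
    struct Derived : Base {};

    int check(Base *b)
    {
      if (dynamic_cast<Derived *>(b) == 0)          // needs -frtti
        throw std::runtime_error("not a Derived");  // needs -fexceptions
      return 0;
    }
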
diff --git a/patches/gcc/linaro-6.3-2017.02/971-crystax.patch b/patches/gcc/linaro-6.3-2017.02/971-crystax.patch
new file mode 100644
index 0000000..748a381
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/971-crystax.patch
@@ -0,0 +1,25 @@
+commit 9f057b62caafe08c968103d39b5df82486a175c2
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 13 16:11:54 2015 +0300
+
+ [android] Add additional multilib option: mfloat-abi=hard
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/t-linux-androideabi b/gcc/config/arm/t-linux-androideabi
+index 8f1307c55..cbbec5bd2 100644
+--- a/gcc/config/arm/t-linux-androideabi
++++ b/gcc/config/arm/t-linux-androideabi
+@@ -1,8 +1,9 @@
+-MULTILIB_OPTIONS = march=armv7-a mthumb
+-MULTILIB_DIRNAMES = armv7-a thumb
+-MULTILIB_EXCEPTIONS =
++MULTILIB_OPTIONS = march=armv7-a mthumb mfloat-abi=hard
++MULTILIB_DIRNAMES = armv7-a thumb hard
++MULTILIB_EXCEPTIONS = mfloat-abi=hard* mthumb/mfloat-abi=hard*
+ MULTILIB_MATCHES =
+ MULTILIB_OSDIRNAMES =
++MULTILIB_EXTRA_OPTS = Wl,--no-warn-mismatch
+
+ # The "special" multilib can be used to build native applications for Android,
+ # as opposed to native shared libraries that are then called via JNI.
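[Illustration, not part of the patch: 971-crystax.patch adds a hard-float multilib variant; the user-visible difference is the calling convention. Built with -mfloat-abi=hard, the arguments below are passed in VFP registers s0/s1 and the result returned in s0, whereas softfp keeps them in core registers — which is also why the patch adds Wl,--no-warn-mismatch to MULTILIB_EXTRA_OPTS, so the linker tolerates the differing float-ABI build attributes:]

    float scale(float a, float b) { return a * b; }
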
diff --git a/patches/gcc/linaro-6.3-2017.02/972-crystax.patch b/patches/gcc/linaro-6.3-2017.02/972-crystax.patch
new file mode 100644
index 0000000..b9077be
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/972-crystax.patch
@@ -0,0 +1,302 @@
+commit 44a81ebb7698dac41ffa7acd5e0cc1578e5ab1fd
+Author: H.J. Lu <hongjiu.lu@intel.com>
+Date: Mon Apr 14 15:59:47 2014 -0700
+
+ [android] Always enable --eh-frame-hdr for static executable
+
+ See 5e6cdf76af295c9a39b695ca228cff675e8ff4ae and
+ 23e3137ee2897464b051599b85a09f130d3ad05d
+
+ Change-Id: Ibda473188e5a10f2a0592f2494ad00ad1f91e04b
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config.in b/gcc/config.in
+index 115cb6163..933916833 100644
+--- a/gcc/config.in
++++ b/gcc/config.in
+@@ -2119,6 +2119,12 @@
+ #endif
+
+
++/* Define if your system supports PT_GNU_EH_FRAME for static executable. */
++#ifndef USED_FOR_TARGET
++#undef USE_EH_FRAME_HDR_FOR_STATIC
++#endif
++
++
+ /* Define to 1 if the 'long long' type is wider than 'long' but still
+ efficiently supported by the host hardware. */
+ #ifndef USED_FOR_TARGET
+diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h
+index 093c38bba..54b3e0c91 100644
+--- a/gcc/config/alpha/elf.h
++++ b/gcc/config/alpha/elf.h
+@@ -168,5 +168,9 @@ extern int alpha_this_gpdisp_sequence_number;
+ I imagine that other systems will catch up. In the meantime, it
+ doesn't harm to make sure that the data exists to be used later. */
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h
+index 5ded869d2..5f51ac81d 100644
+--- a/gcc/config/freebsd.h
++++ b/gcc/config/freebsd.h
+@@ -45,8 +45,12 @@ along with GCC; see the file COPYING3. If not see
+ #define LIB_SPEC FBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #ifdef TARGET_LIBC_PROVIDES_SSP
+ #define LINK_SSP_SPEC "%{fstack-protector|fstack-protector-all" \
+diff --git a/gcc/config/gnu-user.h b/gcc/config/gnu-user.h
+index b0bf40a95..d1874bc29 100644
+--- a/gcc/config/gnu-user.h
++++ b/gcc/config/gnu-user.h
+@@ -118,8 +118,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define LIB_SPEC GNU_USER_TARGET_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LINK_GCC_C_SEQUENCE_SPEC
+ #define LINK_GCC_C_SEQUENCE_SPEC \
+diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h
+index 37ecfc43f..a5f1b9955 100644
+--- a/gcc/config/openbsd.h
++++ b/gcc/config/openbsd.h
+@@ -136,8 +136,12 @@ while (0)
+ #define LIB_SPEC OBSD_LIB_SPEC
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
+ #endif
++#endif
+
+ #undef LIB_SPEC
+ #define LIB_SPEC OBSD_LIB_SPEC
+diff --git a/gcc/config/rs6000/sysv4.h b/gcc/config/rs6000/sysv4.h
+index cbf909722..eb2217fad 100644
+--- a/gcc/config/rs6000/sysv4.h
++++ b/gcc/config/rs6000/sysv4.h
+@@ -789,7 +789,11 @@ ENDIAN_SELECT(" -mbig", " -mlittle", DEFAULT_ASM_ENDIAN)
+ -dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}"
+
+ #if defined(HAVE_LD_EH_FRAME_HDR)
+-# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# ifdef USE_EH_FRAME_HDR_FOR_STATIC
++# define LINK_EH_SPEC "--eh-frame-hdr "
++# else
++# define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++# endif
+ #endif
+
+ #define CPP_OS_LINUX_SPEC "-D__unix__ -D__gnu_linux__ -D__linux__ \
+diff --git a/gcc/config/sol2.h b/gcc/config/sol2.h
+index 5160e1fda..7632a5081 100644
+--- a/gcc/config/sol2.h
++++ b/gcc/config/sol2.h
+@@ -347,7 +347,11 @@ along with GCC; see the file COPYING3. If not see
+ /* Solaris 11 build 135+ implements dl_iterate_phdr. GNU ld needs
+ --eh-frame-hdr to create the required .eh_frame_hdr sections. */
+ #if defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR)
++#ifdef USE_EH_FRAME_HDR_FOR_STATIC
++#define LINK_EH_SPEC "--eh-frame-hdr "
++#else
+ #define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
++#endif
+ #endif /* HAVE_LD_EH_FRAME && TARGET_DL_ITERATE_PHDR */
+ #endif
+
+diff --git a/gcc/configure b/gcc/configure
+index 1c6e3407c..28ad05004 100755
+--- a/gcc/configure
++++ b/gcc/configure
+@@ -934,6 +934,7 @@ enable_fix_cortex_a53_835769
+ enable_fix_cortex_a53_843419
+ with_glibc_version
+ enable_gnu_unique_object
++enable_eh_frame_hdr_for_static
+ enable_linker_build_id
+ enable_default_ssp
+ with_long_double_128
+@@ -1670,6 +1671,9 @@ Optional Features:
+ --enable-gnu-unique-object
+ enable the use of the @gnu_unique_object ELF
+ extension on glibc systems
++ --enable-eh-frame-hdr-for-static
++ enable linker PT_GNU_EH_FRAME support for static
++ executable
+ --enable-linker-build-id
+ compiler will always pass --build-id to linker
+ --enable-default-ssp enable Stack Smashing Protection as default
+@@ -27703,6 +27707,38 @@ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+
+ $as_echo "#define HAVE_LD_EH_FRAME_HDR 1" >>confdefs.h
+
++ # Check whether --enable-eh-frame-hdr-for-static was given.
++if test "${enable_eh_frame_hdr_for_static+set}" = set; then :
++ enableval=$enable_eh_frame_hdr_for_static; case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) as_fn_error "'$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'." "$LINENO" 5 ;;
++ esac
++else
++ # Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi
++fi
++
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++
++$as_echo "#define USE_EH_FRAME_HDR_FOR_STATIC 1" >>confdefs.h
++
++ fi
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_ld_eh_frame_hdr" >&5
+ $as_echo "$gcc_cv_ld_eh_frame_hdr" >&6; }
+diff --git a/gcc/configure.ac b/gcc/configure.ac
+index 6c1dcd9ae..0cf7419e7 100644
+--- a/gcc/configure.ac
++++ b/gcc/configure.ac
+@@ -4828,6 +4828,35 @@ GCC_TARGET_TEMPLATE([HAVE_LD_EH_FRAME_HDR])
+ if test x"$gcc_cv_ld_eh_frame_hdr" = xyes; then
+ AC_DEFINE(HAVE_LD_EH_FRAME_HDR, 1,
+ [Define if your linker supports .eh_frame_hdr.])
++ AC_ARG_ENABLE(eh-frame-hdr-for-static,
++ [AS_HELP_STRING([--enable-eh-frame-hdr-for-static],
++ [enable linker PT_GNU_EH_FRAME support for static executable])],
++ [case $enable_eh_frame_hdr_for_static in
++ yes | no) ;;
++ *) AC_MSG_ERROR(['$enable_eh_frame_hdr_for_static' is an invalid
++value for --enable-eh-frame-hdr-for-static.
++Valid choices are 'yes' and 'no'.]) ;;
++ esac],
++# Only support for glibc 2.3.0 or higher with AT_PHDR/AT_PHNUM from
++# Linux kernel.
++ [[if test x$host = x$build -a x$host = x$target &&
++ ldd --version 2>&1 >/dev/null &&
++ glibcver=`ldd --version 2>/dev/null | sed 's/.* //;q'`; then
++ glibcmajor=`expr "$glibcver" : "\([0-9]*\)"`
++ glibcminor=`expr "$glibcver" : "[2-9]*\.\([0-9]*\)"`
++ glibcnum=`expr $glibcmajor \* 1000 + $glibcminor`
++ if test "$glibcnum" -ge 2003 ; then
++ auvx=`LD_SHOW_AUXV=1 ldd 2>/dev/null`
++ if echo "$auvx" | grep AT_PHDR > /dev/null &&
++ echo "$auvx" | grep AT_PHNUM > /dev/null; then
++ enable_eh_frame_hdr_for_static=yes
++ fi
++ fi
++ fi]])
++ if test x$enable_eh_frame_hdr_for_static = xyes; then
++ AC_DEFINE(USE_EH_FRAME_HDR_FOR_STATIC, 1,
++[Define if your system supports PT_GNU_EH_FRAME for static executable.])
++ fi
+ fi
+ AC_MSG_RESULT($gcc_cv_ld_eh_frame_hdr)
+
+diff --git a/gcc/testsuite/g++.dg/eh/spec3-static.C b/gcc/testsuite/g++.dg/eh/spec3-static.C
+new file mode 100644
+index 000000000..15408effa
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/eh/spec3-static.C
+@@ -0,0 +1,25 @@
++// PR c++/4381
++// Test that exception-specs work properly for classes with virtual bases.
++
++// { dg-do run }
++// { dg-options "-static" }
++
++class Base {};
++
++struct A : virtual public Base
++{
++ A() {}
++};
++
++struct B {};
++
++void func() throw (B,A)
++{
++ throw A();
++}
++
++int main(void)
++{
++ try { func(); }
++ catch (A& a) { }
++}
+diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
+index f3343fc4f..d42647779 100644
+--- a/libgcc/crtstuff.c
++++ b/libgcc/crtstuff.c
+@@ -88,7 +88,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(BSD_DL_ITERATE_PHDR_AVAILABLE)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -97,7 +98,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) && defined(TARGET_DL_ITERATE_PHDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__sun__) && defined(__svr4__)
+ #include <link.h>
+ # define USE_PT_GNU_EH_FRAME
+@@ -106,7 +108,8 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
++ && !defined(inhibit_libc) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(__GLIBC__) && __GLIBC__ >= 2
+ #include <link.h>
+ /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
+@@ -121,7 +124,7 @@ call_ ## FUNC (void) \
+ #if defined(OBJECT_FORMAT_ELF) \
+ && !defined(OBJECT_FORMAT_FLAT) \
+ && defined(HAVE_LD_EH_FRAME_HDR) \
+- && !defined(CRTSTUFFT_O) \
++ && (defined(USE_EH_FRAME_HDR_FOR_STATIC) || !defined(CRTSTUFFT_O)) \
+ && defined(inhibit_libc) \
+ && (defined(__GLIBC__) || defined(__gnu_linux__) || defined(__GNU__))
+ /* On systems using glibc, an inhibit_libc build of libgcc is only
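[Illustration, not part of the patch: the configure check above greps LD_SHOW_AUXV=1 ldd output for AT_PHDR/AT_PHNUM — the auxiliary-vector entries a statically linked program needs in order to locate its own program headers (and therefore PT_GNU_EH_FRAME) through dl_iterate_phdr. The same entries can be read at run time on glibc 2.16+ with getauxval():]

    #include <sys/auxv.h>
    #include <stdio.h>

    int main(void)
    {
      unsigned long phdr  = getauxval(AT_PHDR);   /* address of the program headers */
      unsigned long phnum = getauxval(AT_PHNUM);  /* number of program headers */
      printf("AT_PHDR=%#lx AT_PHNUM=%lu\n", phdr, phnum);
      return (phdr != 0 && phnum != 0) ? 0 : 1;
    }
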
diff --git a/patches/gcc/linaro-6.3-2017.02/973-crystax.patch b/patches/gcc/linaro-6.3-2017.02/973-crystax.patch
new file mode 100644
index 0000000..b96ece3
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/973-crystax.patch
@@ -0,0 +1,20 @@
+commit 778a9ef107f51544d583f110e92b75f4d9d79117
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Thu Aug 20 19:11:07 2015 +0300
+
+ [android] Don't use PIE copyrelocs for x86/x86_64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
+index 3d044e8bd..5c89fcab0 100644
+--- a/gcc/config/i386/i386.c
++++ b/gcc/config/i386/i386.c
+@@ -14631,6 +14631,7 @@ legitimate_pic_address_disp_p (rtx disp)
+ else if (!SYMBOL_REF_FAR_ADDR_P (op0)
+ && (SYMBOL_REF_LOCAL_P (op0)
+ || (HAVE_LD_PIE_COPYRELOC
++ && !TARGET_HAS_BIONIC
+ && flag_pie
+ && !SYMBOL_REF_WEAK (op0)
+ && !SYMBOL_REF_FUNCTION_P (op0)))
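[Illustration, not part of the patch: with the added !TARGET_HAS_BIONIC test, a -fPIE build for Android no longer assumes the static linker will localize an extern data symbol with a PIE copy relocation; the address keeps being loaded from the GOT. A sketch of the kind of reference affected — the variable is just an example:]

    extern char **environ;                 /* defined by the C library */

    const char *first_env_entry(void)
    {
      /* With PIE copyrelocs: direct, link-time-resolved access.
         With this patch on Bionic targets: indirect access through the GOT. */
      return environ ? environ[0] : 0;
    }
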
diff --git a/patches/gcc/linaro-6.3-2017.02/974-crystax.patch b/patches/gcc/linaro-6.3-2017.02/974-crystax.patch
new file mode 100644
index 0000000..9db4f54
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/974-crystax.patch
@@ -0,0 +1,24 @@
+commit dbeae1190cabad83999f2540523f045acc1bb4ec
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Fri Aug 21 17:41:59 2015 +0300
+
+ [android] Always use gthr-posix.h instead of gthr-default.h
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/libgcc/gthr.h b/libgcc/gthr.h
+index 47a7d061a..67a680f90 100644
+--- a/libgcc/gthr.h
++++ b/libgcc/gthr.h
+@@ -145,7 +145,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ #define GTHREAD_USE_WEAK 1
+ #endif
+ #endif
++#if __ANDROID__
++#include "gthr-posix.h"
++#else
+ #include "gthr-default.h"
++#endif
+
+ #ifndef HIDE_EXPORTS
+ #pragma GCC visibility pop
diff --git a/patches/gcc/linaro-6.3-2017.02/975-crystax.patch b/patches/gcc/linaro-6.3-2017.02/975-crystax.patch
new file mode 100644
index 0000000..9efc2a4
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/975-crystax.patch
@@ -0,0 +1,31 @@
+commit 8a66d422721ae5999737d7825701ff22097d287b
+Author: Andrew Hsieh <andrewhsieh@google.com>
+Date: Mon Apr 14 21:05:51 2014 -0700
+
+ [android] Fix ARM generates insufficient alignment for NEON vst/vld
+
+ See d909af3e2469aad87d5c3e79b93c778fd26c03a9
+
+ Change-Id: Ie1de9f946f397196bb6f1623f5add86933739484
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
+index 5974c65d3..71b2c7aa9 100644
+--- a/gcc/config/arm/arm.c
++++ b/gcc/config/arm/arm.c
+@@ -22403,9 +22403,13 @@ arm_print_operand (FILE *stream, rtx x, int code)
+ memsize = MEM_SIZE (x);
+
+ /* Only certain alignment specifiers are supported by the hardware. */
+- if (memsize == 32 && (align % 32) == 0)
++ /* Note that ARM EABI only guarentees 8-byte stack alignment. While GCC
++ honors stricter alignment of composite type in user code, it doesn't
++ observe the alignment of memory passed as an extra argument for function
++ returning large composite type. See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57271 */
++ if (memsize == 32 && (align % 32) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 256;
+- else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
++ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0 && !TARGET_AAPCS_BASED)
+ align_bits = 128;
+ else if (memsize >= 8 && (align % 8) == 0)
+ align_bits = 64;
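[Illustration, not part of the patch: the comment above refers to GCC PR57271 — the hidden buffer a caller provides for a large composite returned by value is only guaranteed 8-byte alignment under AAPCS, so a vst1/vld1 annotated with a 128- or 256-bit alignment hint on that buffer can fault. A sketch of the shape of code involved; the names are illustrative:]

    /* 32-byte over-aligned aggregate returned by value.  The caller allocates
       the return slot on its own stack, and AAPCS only promises 8-byte
       alignment for it.  */
    struct __attribute__((aligned(32))) Block { unsigned char data[32]; };

    Block make_block(void)
    {
      Block b = {};     /* zero-initialise */
      return b;         /* copied into the caller-provided return slot */
    }
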
diff --git a/patches/gcc/linaro-6.3-2017.02/976-crystax.patch b/patches/gcc/linaro-6.3-2017.02/976-crystax.patch
new file mode 100644
index 0000000..790d4a9
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/976-crystax.patch
@@ -0,0 +1,21 @@
+commit 89d27bc45ee7325dcfff6748da0f8b9c1dc1f234
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Sat Aug 22 09:55:55 2015 +0300
+
+ [android][i386] Remove throw() declaration from posix_memalign() proto
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/i386/pmm_malloc.h b/gcc/config/i386/pmm_malloc.h
+index a1f98d3d1..3725799be 100644
+--- a/gcc/config/i386/pmm_malloc.h
++++ b/gcc/config/i386/pmm_malloc.h
+@@ -31,7 +31,7 @@
+ #ifndef __cplusplus
+ extern int posix_memalign (void **, size_t, size_t);
+ #else
+-extern "C" int posix_memalign (void **, size_t, size_t) throw ();
++extern "C" int posix_memalign (void **, size_t, size_t);
+ #endif
+
+ static __inline void *
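[Illustration, not part of the patch: dropping the throw () above avoids a mismatch with Bionic's own prototype of posix_memalign, which carries no exception specification; C++ does not allow redeclaring a function with a different exception specifier, so g++ rejected translation units that saw both declarations. A minimal reproduction of the rule with a placeholder function name:]

    #include <cstddef>

    extern "C" int f(void **, std::size_t, std::size_t);            /* as the libc header declares it */
    extern "C" int f(void **, std::size_t, std::size_t) throw ();   /* rejected: different exception specifier */
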
diff --git a/patches/gcc/linaro-6.3-2017.02/977-crystax.patch b/patches/gcc/linaro-6.3-2017.02/977-crystax.patch
new file mode 100644
index 0000000..0211d72
--- /dev/null
+++ b/patches/gcc/linaro-6.3-2017.02/977-crystax.patch
@@ -0,0 +1,33 @@
+commit 9ae82f7cfc1073820092dd9f957559667e77db0d
+Author: Dmitry Moskalchuk <dm@crystax.net>
+Date: Tue Aug 25 09:36:42 2015 +0300
+
+ [android] Explicitly make _Unwind_Resume visible for arm64/mips64
+
+ Signed-off-by: Dmitry Moskalchuk <dm@crystax.net>
+
+diff --git a/gcc/config/aarch64/aarch64-linux-android.h b/gcc/config/aarch64/aarch64-linux-android.h
+index db1288fd0..38bc64d61 100644
+--- a/gcc/config/aarch64/aarch64-linux-android.h
++++ b/gcc/config/aarch64/aarch64-linux-android.h
+@@ -57,4 +57,8 @@
+ #define ENDFILE_SPEC \
+ LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC)
+
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif
++
+ #endif /* GCC_AARCH64_LINUX_ANDROID_H */
+diff --git a/gcc/config/mips/linux-common.h b/gcc/config/mips/linux-common.h
+index 8bfacf994..262a9a341 100644
+--- a/gcc/config/mips/linux-common.h
++++ b/gcc/config/mips/linux-common.h
+@@ -63,3 +63,7 @@ along with GCC; see the file COPYING3. If not see
+
+ /* The default value isn't sufficient in 64-bit mode. */
+ #define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024)
++
++#ifdef IN_LIBGCC2
++#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((visibility("default")))
++#endif