kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 320a0bf21df43c4494344571e66570d8a8dcb684
parent c6074e188884ca756adfef7b8b7ea0fd1d73b5f8
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Mon, 11 Mar 2019 15:54:04 +0000

make mask<> a specialization of vec<>

Diffstat:
Acmake/aarch64.cmake | 37+++++++++++++++++++++++++++++++++++++
Mcmake/arm.cmake | 11++++++++---
Minclude/kfr/cident.h | 2+-
Minclude/kfr/cometa/numeric.hpp | 2+-
Minclude/kfr/dsp/window.hpp | 2+-
Minclude/kfr/math/impl/logical.hpp | 250++++++++++++++++++++++++++++++++++++++++---------------------------------------
Minclude/kfr/math/impl/select.hpp | 125+++++++++++++++++++++++++++++++++++++++----------------------------------------
Minclude/kfr/math/logical.hpp | 4++--
Minclude/kfr/math/select.hpp | 2+-
Minclude/kfr/simd/impl/backend_clang.hpp | 8++++----
Minclude/kfr/simd/impl/backend_generic.hpp | 27+++++++++++++++------------
Minclude/kfr/simd/impl/basicoperators_generic.hpp | 36++++++++++++++++++------------------
Minclude/kfr/simd/impl/function.hpp | 44++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/simd/impl/simd.hpp | 75++++++---------------------------------------------------------------------
Ainclude/kfr/simd/impl/specialconstants.hpp | 101+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/simd/mask.hpp | 97++-----------------------------------------------------------------------------
Minclude/kfr/simd/operators.hpp | 4+++-
Minclude/kfr/simd/platform.hpp | 9++++++++-
Minclude/kfr/simd/types.hpp | 94+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Minclude/kfr/simd/vec.hpp | 29+++++++++++++++++++++--------
Msources.cmake | 1+
Mtests/CMakeLists.txt | 6++----
Mtests/expression_test.cpp | 7+++++++
Mtests/unit/simd/operators.cpp | 12++++++------
Mtests/unit/simd/vec.cpp | 22+++++++++++++++++++++-
25 files changed, 589 insertions(+), 418 deletions(-)

diff --git a/cmake/aarch64.cmake b/cmake/aarch64.cmake @@ -0,0 +1,37 @@ +# For internal use only + +set (CMAKE_SYSTEM_NAME Linux) +set (CMAKE_SYSTEM_VERSION 1) +set (UNIX True) +set (ARM True) +set (AARCH64 True) +set (CMAKE_SYSTEM_PROCESSOR aarch64) +set (EMULATOR qemu-aarch64) + +include (CMakeForceCompiler) +CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang) +CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang) +set (CMAKE_CXX_COMPILER_WORKS TRUE) +set (CMAKE_C_COMPILER_WORKS TRUE) + +set(TGT_TRIPLET aarch64-linux-gnu) + +set (ARM_ROOT "/usr/${TGT_TRIPLET}/include") +if (NOT GCC_VER) + set (GCC_VER 5.4.0) +endif () +set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}") + +set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a72 -static") + +set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}") +set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}") + +set (CMAKE_CXX_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") +set (CMAKE_C_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_C_LINK_FLAGS}") + +message(STATUS "${ARM_COMMON_FLAGS}") + +set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) +set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/cmake/arm.cmake b/cmake/arm.cmake @@ -1,8 +1,11 @@ +# For internal use only + set (CMAKE_SYSTEM_NAME Linux) set (CMAKE_SYSTEM_VERSION 1) set (UNIX True) set (ARM True) set (CMAKE_SYSTEM_PROCESSOR arm) +set (EMULATOR qemu-arm) include (CMakeForceCompiler) CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang) @@ -10,13 +13,15 @@ CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang) set (CMAKE_CXX_COMPILER_WORKS TRUE) set (CMAKE_C_COMPILER_WORKS TRUE) -set (ARM_ROOT "/usr/arm-linux-gnueabihf/include") +set(TGT_TRIPLET arm-linux-gnueabihf) + +set (ARM_ROOT "/usr/${TGT_TRIPLET}/include") if (NOT GCC_VER) set (GCC_VER 5.4.0) endif () -set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}") +set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}") -set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static") +set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static") set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}") set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}") diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -141,7 +141,7 @@ extern char* gets(char* __s); #define CMT_ARCH_BITNESS_NAME "32-bit" #endif -#ifdef __ARM_NEON__ +#if defined __ARM_NEON__ || defined __ARM_NEON #if __ARM_ARCH >= 8 && defined(__aarch64__) #define CMT_ARCH_NEON64 1 diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp @@ -142,7 +142,7 @@ using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>; template <typename T> struct typebits { - static_assert(is_number<deep_subtype<T>>::value, ""); + // static_assert(is_number<deep_subtype<T>>::value, ""); constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8; constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width; using subtype = typename compound_type_traits<T>::subtype; diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp @@ -131,7 +131,7 @@ struct expression_rectangular : input_expression size_t index, vec_shape<T, N>) { using TI = utype<T>; - const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index); + const vec<TI, N> i = enumerate(vec_shape<TI, N>()) + static_cast<TI>(index); return select(i < static_cast<TI>(self.m_size), T(1), T(0)); } size_t size() const { return m_size; } diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp @@ -39,100 +39,106 @@ namespace intrinsics #if defined CMT_ARCH_SSE41 // horizontal OR -KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); } // horizontal AND -KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } #endif #if defined CMT_ARCH_AVX // horizontal OR -KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); } // horizontal AND -KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } #if defined CMT_ARCH_AVX512 // horizontal OR -KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); } -KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); } // horizontal AND -KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); } -KFR_INTRINSIC bool bittestall(const f64avx512& x) { return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } -KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const mf32avx512& x) +{ + return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); +} +KFR_INTRINSIC bool bittestall(const mf64avx512& x) +{ + return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); +} +KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } #endif #elif defined CMT_ARCH_SSE41 -KFR_INTRINSIC bool bittestany(const f32sse& x) +KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); } -KFR_INTRINSIC bool bittestany(const f64sse& x) +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); } -KFR_INTRINSIC bool bittestall(const f32sse& x) +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); } -KFR_INTRINSIC bool bittestall(const f64sse& x) +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); } @@ -140,100 +146,100 @@ KFR_INTRINSIC bool bittestall(const f64sse& x) #if !defined CMT_ARCH_SSE41 -KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); } -KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); } -KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); } - -KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); } -KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); } -KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); } +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); } + +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); } +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); } #endif template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) { - return bittestall(expand_simd(a, internal::maskbits<T>(true))); + return bittestall(expand_simd(a, bit<T>(true))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) { return bittestall(low(a)) && bittestall(high(a)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) { - return bittestany(expand_simd(a, internal::maskbits<T>(false))); + return bittestany(expand_simd(a, bit<T>(false))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) { return bittestany(low(a)) || bittestany(high(a)); } #elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS -KFR_INTRINSIC bool bittestall(const u32neon& a) +KFR_INTRINSIC bool bittestall(const mu32neon& a) { const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; } -KFR_INTRINSIC bool bittestany(const u32neon& a) +KFR_INTRINSIC bool bittestany(const mu32neon& a) { const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; } -KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } - -KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<u32>(a)); } + +KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<u32>(a)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) { - return bittestall(expand_simd(a, internal::maskbits<T>(true))); + return bittestall(expand_simd(a, bit<T>(true))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) { return bittestall(low(a)) && bittestall(high(a)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) { - return bittestany(expand_simd(a, internal::maskbits<T>(false))); + return bittestany(expand_simd(a, bit<T>(false))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) { return bittestany(low(a)) || bittestany(high(a)); } @@ -241,34 +247,34 @@ KFR_INTRINSIC bool bittestany(const vec<T, N>& a) #else template <typename T, size_t N> -KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x) +KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x) { typename bitmask<N>::type val = 0; for (size_t i = 0; i < N; i++) { - val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; + val |= static_cast<int>(x[i]) << i; } return val; } template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const vec<T, N>& x) +KFR_INTRINSIC bool bittestany(const mask<T, N>& x) { return getmask(x).value; } template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y) { return bittestany(x & y); } template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const vec<T, N>& x) +KFR_INTRINSIC bool bittestall(const mask<T, N>& x) { return !getmask(~x).value; } template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y) { return !bittestany(~x & y); } diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp @@ -35,265 +35,262 @@ namespace intrinsics #if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS -KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y) +KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y) +KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y) +KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y) +KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y) +KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y) +KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y) +KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y) +KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y) { return _mm_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y) +KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y) { return _mm_blendv_ps(y.v, x.v, m.v); } -KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y) +KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y) { return _mm_blendv_pd(y.v, x.v, m.v); } #if defined CMT_ARCH_AVX -KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y) +KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y) { return _mm256_blendv_pd(y.v, x.v, m.v); } -KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y) +KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y) { return _mm256_blendv_ps(y.v, x.v, m.v); } #endif #if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y) +KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u16avx select(const u16avx& m, const u16avx& x, const u16avx& y) +KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y) +KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y) +KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y) +KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y) +KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y) +KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } -KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y) +KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y) { return _mm256_blendv_epi8(y.v, x.v, m.v); } #endif #if defined CMT_ARCH_AVX512 -KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y) +KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y) { return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v); } -KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y) +KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y) { return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v); } -KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y) +KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y) { return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); } -KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y) +KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y) { return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); } -KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y) +KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y) { return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); } -KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y) +KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y) { return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); } -KFR_INTRINSIC i8avx512 select(const i8avx512& m, const i8avx512& x, const i8avx512& y) +KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y) { return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); } -KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y) +KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y) { return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); } -KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y) +KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y) { return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); } -KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y) +KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y) { return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); } #endif template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { constexpr size_t Nout = next_simd_width<T>(N); return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) .shuffle(csizeseq<N>); } template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { - vec<T, N> r; - intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); }); - return r; - // return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); + return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) { constexpr size_t Nout = next_simd_width<T>(N); return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>); } template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) { return concat2(select(a.h.low, b, c), select(a.h.high, b, c)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) { constexpr size_t Nout = next_simd_width<T>(N); return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>); } template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) { return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) { constexpr size_t Nout = next_simd_width<T>(N); return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>)) .shuffle(csizeseq<N>); } template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) { return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); } #elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS -KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y) +KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y) { return vbslq_f32(m.v, x.v, y.v); } -KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y) +KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) { return vbslq_s8(m.v, x.v, y.v); } -KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y) +KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) { return vbslq_u8(m.v, x.v, y.v); } -KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y) +KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y) { return vbslq_s16(m.v, x.v, y.v); } -KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y) +KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y) { return vbslq_u16(m.v, x.v, y.v); } -KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y) +KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y) { return vbslq_s32(m.v, x.v, y.v); } -KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y) +KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y) { return vbslq_u32(m.v, x.v, y.v); } -KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y) +KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y) { return vbslq_s64(m.v, x.v, y.v); } -KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y) +KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y) { return vbslq_u64(m.v, x.v, y.v); } #ifdef CMT_ARCH_NEON64 -KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) { return vbslq_f64(m.v, x.v, y.v); } #else -KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) { - return y ^ ((x ^ y) & m); + return y ^ ((x ^ y) & m.asvec()); } #endif template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { constexpr size_t Nout = next_simd_width<T>(N); return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) .shuffle(csizeseq<N>); } template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) { return select(m, vec<T, N>(x), vec<T, N>(y)); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) { return select(m, x, vec<T, N>(y)); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) { return select(m, vec<T, N>(x), y); } @@ -302,22 +299,22 @@ KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& // fallback template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y) { - return y ^ ((x ^ y) & m); + return y ^ ((x ^ y) & m.asvec()); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) { return select(m, vec<T, N>(x), vec<T, N>(y)); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) { return select(m, x, vec<T, N>(y)); } template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) { return select(m, vec<T, N>(x), y); } diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp @@ -39,7 +39,7 @@ inline namespace CMT_ARCH_NAME template <typename T, size_t N> KFR_INTRINSIC bool all(const mask<T, N>& x) { - return intrinsics::bittestall(x.asvec()); + return intrinsics::bittestall(x); } /** @@ -48,7 +48,7 @@ KFR_INTRINSIC bool all(const mask<T, N>& x) template <typename T, size_t N> KFR_INTRINSIC bool any(const mask<T, N>& x) { - return intrinsics::bittestany(x.asvec()); + return intrinsics::bittestany(x); } } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp @@ -43,7 +43,7 @@ template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_nume KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) { static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); - return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y)); + return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), innercast<Tout>(x), innercast<Tout>(y)); } /** diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp @@ -36,7 +36,7 @@ namespace intrinsics { template <typename TT, size_t NN> -using simd = TT __attribute__((ext_vector_type(NN))); +using simd = unwrap_bit<TT> __attribute__((ext_vector_type(NN))); template <typename T, size_t N1> KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x); @@ -51,13 +51,13 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete; template <typename Tout, typename Arg> KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) { - return (simd<Tout, 1>){ static_cast<Tout>(arg) }; + return (simd<Tout, 1>){ static_cast<unwrap_bit<Tout>>(arg) }; } template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)> KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) { - return (simd<Tout, N>){ static_cast<Tout>(args)... }; + return (simd<Tout, N>){ static_cast<unwrap_bit<Tout>>(args)... }; } /// @brief Returns vector with undefined value @@ -111,7 +111,7 @@ KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) template <typename T, size_t N> KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) { - return value; + return static_cast<unwrap_bit<T>>(value); } template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -49,7 +49,7 @@ namespace intrinsics { template <typename T, size_t N> -using simd = typename simd_type<T, N>::type; +using simd = typename simd_type<unwrap_bit<T>, N>::type; template <typename T, size_t N, typename U> union simd_small_array { @@ -767,7 +767,7 @@ KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT template <typename T, size_t N, size_t... indices> KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT { - return { x.val[indices]... }; + return { static_cast<unwrap_bit<T>>(x.val[indices])... }; } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)> @@ -806,7 +806,7 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete; template <typename Tout, typename Arg> KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT { - return simd<Tout, 1>{ static_cast<Tout>(arg) }; + return simd<Tout, 1>{ static_cast<unwrap_bit<Tout>>(static_cast<Tout>(arg)) }; } template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)> @@ -931,7 +931,7 @@ KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, template <typename T> KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT { - return { value }; + return { static_cast<unwrap_bit<T>>(value) }; } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)> @@ -964,7 +964,7 @@ simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsign for (size_t i = 0; i < Nout; ++i) { const size_t index = indices[i]; - result.val[i] = index >= N ? T() : x.val[index]; + result.val[i] = index >= N ? T() : static_cast<T>(x.val[index]); } return result; } @@ -977,7 +977,9 @@ simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd for (size_t i = 0; i < Nout; ++i) { const size_t index = indices[i]; - result.val[i] = index > N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index]; + result.val[i] = index >= N1 + N2 + ? T() + : index >= N1 ? static_cast<T>(y.val[index - N1]) : static_cast<T>(x.val[index]); } return result; } @@ -992,7 +994,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csiz constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array)); #else - return from_simd_array<T, Nout>({ (indices > N ? T() : to_simd_array<T, N>(x).val[indices])... }); + return from_simd_array<T, Nout>( + { (indices >= N ? T() : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); #endif } @@ -1009,9 +1012,9 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array)); #else return from_simd_array<T, Nout>( - { (indices > N * 2 ? T() - : indices >= N ? to_simd_array<T, N>(y).val[indices - N] - : to_simd_array<T, N>(x).val[indices])... }); + { (indices >= N * 2 ? T() + : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N]) + : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); #endif } @@ -1031,8 +1034,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& return from_simd_array<T, Nout>( { (indices > N1 + N2 ? T() - : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1] - : to_simd_array<T, N1>(x).val[indices])... }); + : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1]) + : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... }); #endif } diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp @@ -412,19 +412,19 @@ KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128 #else KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] == y[i])); } KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] == y[i])); } KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] != y[i])); } KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] != y[i])); } #endif @@ -458,35 +458,35 @@ KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y) #else KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] > y[i])); } KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] > y[i])); } KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] < y[i])); } KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] < y[i])); } KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] >= y[i])); } KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] >= y[i])); } KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y) { - KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i])); + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] <= y[i])); } KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y) { - KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i])); + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] <= y[i])); } #endif @@ -1557,32 +1557,32 @@ KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y) template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] == y[i])); } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] != y[i])); } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] >= y[i])); } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] <= y[i])); } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] > y[i])); } template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y) { - KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i])); + KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] < y[i])); } template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)> diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp @@ -105,6 +105,39 @@ using u16avx512 = vec<u16, 32>; using u32avx512 = vec<u32, 16>; using u64avx512 = vec<u64, 8>; +using mf32sse = mask<f32, 4>; +using mf64sse = mask<f64, 2>; +using mi8sse = mask<i8, 16>; +using mi16sse = mask<i16, 8>; +using mi32sse = mask<i32, 4>; +using mi64sse = mask<i64, 2>; +using mu8sse = mask<u8, 16>; +using mu16sse = mask<u16, 8>; +using mu32sse = mask<u32, 4>; +using mu64sse = mask<u64, 2>; + +using mf32avx = mask<f32, 8>; +using mf64avx = mask<f64, 4>; +using mi8avx = mask<i8, 32>; +using mi16avx = mask<i16, 16>; +using mi32avx = mask<i32, 8>; +using mi64avx = mask<i64, 4>; +using mu8avx = mask<u8, 32>; +using mu16avx = mask<u16, 16>; +using mu32avx = mask<u32, 8>; +using mu64avx = mask<u64, 4>; + +using mf32avx512 = mask<f32, 16>; +using mf64avx512 = mask<f64, 8>; +using mi8avx512 = mask<i8, 64>; +using mi16avx512 = mask<i16, 32>; +using mi32avx512 = mask<i32, 16>; +using mi64avx512 = mask<i64, 8>; +using mu8avx512 = mask<u8, 64>; +using mu16avx512 = mask<u16, 32>; +using mu32avx512 = mask<u32, 16>; +using mu64avx512 = mask<u64, 8>; + #else using f32neon = vec<f32, 4>; using f64neon = vec<f64, 2>; @@ -116,6 +149,17 @@ using u8neon = vec<u8, 16>; using u16neon = vec<u16, 8>; using u32neon = vec<u32, 4>; using u64neon = vec<u64, 2>; + +using mf32neon = mask<f32, 4>; +using mf64neon = mask<f64, 2>; +using mi8neon = mask<i8, 16>; +using mi16neon = mask<i16, 8>; +using mi32neon = mask<i32, 4>; +using mi64neon = mask<i64, 2>; +using mu8neon = mask<u8, 16>; +using mu16neon = mask<u16, 8>; +using mu32neon = mask<u32, 4>; +using mu64neon = mask<u64, 2>; #endif template <typename T> diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp @@ -30,75 +30,6 @@ namespace kfr inline namespace CMT_ARCH_NAME { -#if defined CMT_COMPILER_GNU -constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); } -constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); } -constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); } -constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); } -#elif defined CMT_COMPILER_MSVC -constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); } -constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); } -constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); } -constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); } -#else -inline f32 allones_f32() CMT_NOEXCEPT -{ - return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu))); -} -inline f64 allones_f64() CMT_NOEXCEPT -{ - return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull))); -} -inline f32 invhighbit_f32() CMT_NOEXCEPT -{ - return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu))); -} -inline f64 invhighbit_f64() CMT_NOEXCEPT -{ - return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull))); -} -#endif - -template <typename T> -struct special_scalar_constants -{ - constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); } - constexpr static T allones() { return static_cast<T>(-1ll); } - constexpr static T allzeros() { return T(0); } - constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); } -}; - -#ifndef CMT_COMPILER_INTEL -#define KFR_CONSTEXPR_NON_INTEL constexpr -#else -#define KFR_CONSTEXPR_NON_INTEL -#endif - -template <> -struct special_scalar_constants<float> -{ - constexpr static float highbitmask() { return -0.f; } - KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); }; - constexpr static float allzeros() { return 0.f; } - KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); } -}; - -template <> -struct special_scalar_constants<double> -{ - constexpr static double highbitmask() { return -0.; } - KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); }; - constexpr static double allzeros() { return 0.; } - KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); } -}; - -template <typename T> -struct special_constants : public special_scalar_constants<subtype<T>> -{ -public: - using Tsub = subtype<T>; -}; - namespace intrinsics { @@ -142,6 +73,12 @@ struct alignas(alignment<T, N>()) simd_array }; template <typename T, size_t N> +struct alignas(alignment<T, N>()) simd_array<bit<T>, N> +{ + bit_value<T> val[next_poweroftwo(N)]; +}; + +template <typename T, size_t N> struct simd_type; template <typename T> diff --git a/include/kfr/simd/impl/specialconstants.hpp b/include/kfr/simd/impl/specialconstants.hpp @@ -0,0 +1,101 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../cometa/numeric.hpp" +#include "intrinsics.h" + +namespace kfr +{ +using namespace cometa; + +#if defined CMT_COMPILER_GNU +constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); } +constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); } +constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); } +constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); } +#elif defined CMT_COMPILER_MSVC +constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); } +constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); } +constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); } +constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); } +#else +inline f32 allones_f32() CMT_NOEXCEPT +{ + return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu))); +} +inline f64 allones_f64() CMT_NOEXCEPT +{ + return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull))); +} +inline f32 invhighbit_f32() CMT_NOEXCEPT +{ + return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu))); +} +inline f64 invhighbit_f64() CMT_NOEXCEPT +{ + return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull))); +} +#endif + +template <typename T> +struct special_scalar_constants +{ + constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); } + constexpr static T allones() { return static_cast<T>(-1ll); } + constexpr static T allzeros() { return T(0); } + constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); } +}; + +#ifndef CMT_COMPILER_INTEL +#define KFR_CONSTEXPR_NON_INTEL constexpr +#else +#define KFR_CONSTEXPR_NON_INTEL +#endif + +template <> +struct special_scalar_constants<float> +{ + constexpr static float highbitmask() { return -0.f; } + KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); }; + constexpr static float allzeros() { return 0.f; } + KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); } +}; + +template <> +struct special_scalar_constants<double> +{ + constexpr static double highbitmask() { return -0.; } + KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); }; + constexpr static double allzeros() { return 0.; } + KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); } +}; + +template <typename T> +struct special_constants : public special_scalar_constants<subtype<T>> +{ +public: + using Tsub = subtype<T>; +}; + +} // namespace kfr diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp @@ -39,60 +39,6 @@ using maskfor = typename T::mask_t; namespace internal { -template <typename T> -constexpr inline T maskbits(bool value) -{ - return value ? special_constants<T>::allones() : special_constants<T>::allzeros(); -} -} // namespace internal - -template <typename T, size_t N> -struct mask : protected vec<T, N> -{ - using base = vec<T, N>; - - KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default; - - KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default; - - KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default; - - using simd_type = typename base::simd_type; - - KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {} - - template <typename... Args> - KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args) - : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2), - internal::maskbits<T>(static_cast<bool>(args))...) - { - } - - using vec<T, N>::v; - - KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT; - - KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {} - - template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) - { - } - - template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))> - KFR_MEM_INTRINSIC mask(const mask<U, N>& m) - : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec())))) - { - } - - KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT; - - KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); } -}; - -namespace internal -{ - template <typename T, size_t Nout, size_t N1, size_t... indices> constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>) { @@ -106,50 +52,11 @@ constexpr vec<T, Nout> partial_mask() } } // namespace internal -template <typename T, size_t N> -KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT -{ - return ibitcast(base::operator[](index)) < 0; -} - template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)> -constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... args) +constexpr KFR_INTRINSIC vec<bit<T>, Nout> make_mask(bool arg, Args... args) { - return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...); + return vec<bit<T>, Nout>(arg, static_cast<bool>(args)...); } } // namespace CMT_ARCH_NAME } // namespace kfr - -namespace cometa -{ - -template <typename T, size_t N> -struct compound_type_traits<kfr::mask<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::mask<U, N>; - template <typename U> - using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; - - KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) - { - return value[index]; - } -}; -} // namespace cometa - -namespace std -{ -template <typename T1, typename T2, size_t N> -struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> -{ - using type = kfr::mask<typename common_type<T1, T2>::type, N>; -}; -} // namespace std diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp @@ -800,11 +800,13 @@ vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec< KFR_FN(packtranspose) +#if 0 template <typename T, size_t N> -KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT +KFR_I_CE vec<bit<T>, N>::vec(const base& v) CMT_NOEXCEPT { this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v; } +#endif } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp @@ -161,6 +161,8 @@ struct platform<cpu_t::common> constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; constexpr static bool fast_unaligned = false; + + constexpr static bool mask_registers = false; }; template <> struct platform<cpu_t::sse2> : platform<cpu_t::common> @@ -208,6 +210,8 @@ struct platform<cpu_t::avx512> : platform<cpu_t::avx2> constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; constexpr static size_t simd_register_count = bitness_const(8, 32); + + constexpr static bool mask_registers = true; }; #endif #ifdef CMT_ARCH_ARM @@ -234,6 +238,8 @@ struct platform<cpu_t::common> constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; constexpr static bool fast_unaligned = false; + + constexpr static bool mask_registers = false; }; template <> struct platform<cpu_t::neon> : platform<cpu_t::common> @@ -279,8 +285,9 @@ constexpr static bool is_simd_size(size_t size) template <typename T, size_t N = vector_width<T>> struct vec; + template <typename T, size_t N = vector_width<T>> -struct mask; +using mask = vec<bit<T>, N>; } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp @@ -28,6 +28,7 @@ #include "../kfr.h" #include "impl/intrinsics.h" +#include "impl/specialconstants.hpp" #include <climits> @@ -40,8 +41,8 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers") #ifdef KFR_TESTING -#include "../testo/testo.hpp" #include "../cometa/function.hpp" +#include "../testo/testo.hpp" #endif #include "../cometa.hpp" @@ -161,7 +162,8 @@ constexpr size_t max_test_size = 32; template <template <typename, size_t> class vec_tpl, typename T, typename sizes = #ifdef KFR_EXTENDED_TESTS - cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)> + cfilter_t<decltype(test_vector_sizes), + decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)> #else csizes_t<1, 2> #endif @@ -255,9 +257,88 @@ struct bitmask }; template <typename T> -struct maskbit +constexpr inline T maskbits(bool value) +{ + return value ? special_constants<T>::allones() : special_constants<T>::allzeros(); +} + +template <typename T> +struct bit_value; + +template <typename T> +struct bit +{ + alignas(T) bool value; + bit() CMT_NOEXCEPT = default; + + constexpr bit(const bit_value<T>& value) CMT_NOEXCEPT : value(static_cast<bool>(value)) {} + + constexpr bit(T value) CMT_NOEXCEPT : value(bitcast_anything<itype<T>>(value) < 0) {} + constexpr bit(bool value) CMT_NOEXCEPT : value(value) {} + + template <typename U> + constexpr bit(const bit<U>& value) CMT_NOEXCEPT : value(value.value) + { + } + + constexpr operator bool() const CMT_NOEXCEPT { return value; } + constexpr explicit operator T() const CMT_NOEXCEPT { return maskbits<T>(value); } +}; + +template <typename T> +struct bit_value +{ + T value; + bit_value() CMT_NOEXCEPT = default; + + constexpr bit_value(const bit<T>& value) CMT_NOEXCEPT : bit_value(value.value) {} + + constexpr bit_value(T value) CMT_NOEXCEPT : value(value) {} + constexpr bit_value(bool value) CMT_NOEXCEPT : value(maskbits<T>(value)) {} + + template <typename U> + constexpr bit_value(const bit_value<U>& value) CMT_NOEXCEPT : bit_value(value.operator bool()) + { + } + + constexpr operator bool() const CMT_NOEXCEPT { return bitcast_anything<itype<T>>(value) < 0; } + constexpr explicit operator T() const CMT_NOEXCEPT { return value; } +}; + +template <typename T> +struct special_scalar_constants<bit<T>> +{ + constexpr static bit<T> highbitmask() { return true; } + constexpr static bit<T> allones() noexcept { return true; }; + constexpr static bit<T> allzeros() { return false; } + constexpr static bit<T> invhighbitmask() { return false; } +}; + +namespace internal_generic +{ +template <typename T> +struct unwrap_bit +{ + using type = T; +}; +template <typename T> +struct unwrap_bit<bit<T>> +{ + using type = T; +}; + +} // namespace internal_generic + +template <typename T> +using unwrap_bit = typename internal_generic::unwrap_bit<T>::type; + +template <typename T> +struct is_bit : cfalse_t +{ +}; +template <typename T> +struct is_bit<bit<T>> : ctrue_t { - bool value; }; namespace fn_generic @@ -340,6 +421,11 @@ struct is_simd_type { }; +template <typename T> +struct is_simd_type<bit<T>> : is_simd_type<T> +{ +}; + template <typename T, size_t N> struct vec_shape { diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -112,9 +112,6 @@ template <typename T, size_t N> struct vec; template <typename T, size_t N> -struct mask; - -template <typename T, size_t N> struct vec_halves { vec<T, prev_poweroftwo(N - 1)> low; @@ -250,15 +247,15 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< } // from vector of another type - template <typename U, - KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)> + template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && + (compound_type_traits<T>::is_scalar && !is_bit<U>::value))> KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v)) { } - template <typename U, - KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> + template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && + !(compound_type_traits<T>::is_scalar && !is_bit<U>::value))> KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT : v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v) { @@ -412,6 +409,11 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); } + KFR_MEM_INTRINSIC constexpr vec<unwrap_bit<T>, N> asvec() const CMT_NOEXCEPT + { + return vec<unwrap_bit<T>, N>(v); + } + constexpr static size_t simd_element_size = const_min(vector_width<T>, N); constexpr static size_t simd_element_count = N / simd_element_size; using simd_element_type = simd<ST, simd_element_size>; @@ -1152,7 +1154,18 @@ void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isap namespace internal { -// vector<vector> to vector<vector> +// vector to vector<vector> +template <typename To, typename From, size_t N> +struct conversion<vec<bit<To>, N>, vec<bit<From>, N>> +{ + static vec<bit<To>, N> cast(const vec<bit<From>, N>& value) + { + return vec<To, N>::frombits(innercast<itype<To>>(vec<itype<From>, N>::frombits(value.asvec()))) + .asmask(); + } +}; + +// vector to vector<vector> template <typename To, typename From, size_t N1, size_t N2, size_t Ns1> struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>> { diff --git a/sources.cmake b/sources.cmake @@ -136,6 +136,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i ${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -33,7 +33,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") if (ENABLE_ASMTEST) add_executable(asm_test asm_test.cpp) target_link_libraries(asm_test kfr) - target_set_arch(asm_test PRIVATE avx2) + target_set_arch(asm_test PRIVATE sse2) target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") target_compile_options(asm_test PRIVATE -fno-stack-protector) @@ -153,9 +153,7 @@ if(USE_SDE) find_program(EMULATOR "sde") list(APPEND EMULATOR "-skx") list(APPEND EMULATOR "--") -elseif (ARM) - find_program(EMULATOR "qemu-arm") -else () +elseif (NOT EMULATOR) set(EMULATOR "") endif () diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp @@ -133,6 +133,13 @@ TEST(mix) }); } +TEST(expression_mask) +{ + univector<float> x(100); + univector<float> y(100); + x = select(x > y, 0.5f, 0.1f) * (y - x) + x; +} + constexpr inline size_t fast_range_sum(size_t stop) { return stop * (stop + 1) / 2; } TEST(partition) diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp @@ -142,7 +142,7 @@ TEST(eq) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x == y); + return maskbits<subtype<decltype(x)>>(x == y); }); } @@ -150,7 +150,7 @@ TEST(ne) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x != y); + return maskbits<subtype<decltype(x)>>(x != y); }); } @@ -158,7 +158,7 @@ TEST(ge) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x >= y); + return maskbits<subtype<decltype(x)>>(x >= y); }); } @@ -166,7 +166,7 @@ TEST(le) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x <= y); + return maskbits<subtype<decltype(x)>>(x <= y); }); } @@ -174,7 +174,7 @@ TEST(gt) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x > y); + return maskbits<subtype<decltype(x)>>(x > y); }); } @@ -182,7 +182,7 @@ TEST(lt) { test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); }, [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { - return internal::maskbits<subtype<decltype(x)>>(x < y); + return maskbits<subtype<decltype(x)>>(x < y); }); } diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp @@ -124,11 +124,31 @@ TEST(unaligned_read) for (size_t i = 0; i < N; i++) { -// testo::scope sc(as_string("i = ", i)); + testo::scope sc(as_string("i = ", i)); CHECK(read<N, false>(data + i) == (enumerate<Tsub, N>() + static_cast<Tsub>(i))); } }); } +TEST(mask_broadcast) +{ + CHECK(mask<i32, 4>(mask<f32, 4>(true, false, true, false)).asvec() == vec<i32, 4>(-1, 0, -1, 0)); + CHECK(mask<i32, 4>(mask<f32, 4>(true)).asvec() == vec<i32, 4>(-1, -1, -1, -1)); +} + +TEST(masks) +{ + mask<float, 4> m = make_mask<float>(false, true, false, true); + vec<float, 4> v = m.asvec(); + CHECK(bit<float>(m[0]) == false); + CHECK(bit<float>(m[1]) == true); + CHECK(bit<float>(m[2]) == false); + CHECK(bit<float>(m[3]) == true); + CHECK(float(v[0]) == maskbits<float>(false)); + CHECK(float(v[1]) == maskbits<float>(true)); + CHECK(float(v[2]) == maskbits<float>(false)); + CHECK(float(v[3]) == maskbits<float>(true)); +} + } // namespace CMT_ARCH_NAME } // namespace kfr