commit 320a0bf21df43c4494344571e66570d8a8dcb684
parent c6074e188884ca756adfef7b8b7ea0fd1d73b5f8
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Mon, 11 Mar 2019 15:54:04 +0000
make mask<> a specialization of vec<>
Diffstat:
25 files changed, 589 insertions(+), 418 deletions(-)
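
The heart of the change: mask<T, N> is no longer an independent class but a vec<> whose element type is a one-bit-per-lane wrapper, so every intrinsic below that used to take a plain vec for its mask operand is retyped to take a mask (see the m-prefixed aliases added to function.hpp). A rough sketch of the relationship, assuming definitions along these lines in headers not shown in this diff:

    #include <cstddef>

    // Sketch only; the real bit<>/vec<> definitions are not part of this diff.
    template <typename T>
    struct bit
    {
        bool value; // one boolean lane standing in for an all-ones/all-zeros lane of T
    };

    template <typename T, std::size_t N>
    struct vec; // primary template, unchanged by this commit

    // mask<> becomes a specialization (alias) of vec<> over bit lanes:
    template <typename T, std::size_t N>
    using mask = vec<bit<T>, N>;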
diff --git a/cmake/aarch64.cmake b/cmake/aarch64.cmake
@@ -0,0 +1,37 @@
+# For internal use only
+
+set (CMAKE_SYSTEM_NAME Linux)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (ARM True)
+set (AARCH64 True)
+set (CMAKE_SYSTEM_PROCESSOR aarch64)
+set (EMULATOR qemu-aarch64)
+
+include (CMakeForceCompiler)
+CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang)
+CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang)
+set (CMAKE_CXX_COMPILER_WORKS TRUE)
+set (CMAKE_C_COMPILER_WORKS TRUE)
+
+set(TGT_TRIPLET aarch64-linux-gnu)
+
+set (ARM_ROOT "/usr/${TGT_TRIPLET}/include")
+if (NOT GCC_VER)
+ set (GCC_VER 5.4.0)
+endif ()
+set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}")
+
+set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a72 -static")
+
+set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}")
+set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}")
+
+set (CMAKE_CXX_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+set (CMAKE_C_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+
+message(STATUS "${ARM_COMMON_FLAGS}")
+
+set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/cmake/arm.cmake b/cmake/arm.cmake
@@ -1,8 +1,11 @@
+# For internal use only
+
set (CMAKE_SYSTEM_NAME Linux)
set (CMAKE_SYSTEM_VERSION 1)
set (UNIX True)
set (ARM True)
set (CMAKE_SYSTEM_PROCESSOR arm)
+set (EMULATOR qemu-arm)
include (CMakeForceCompiler)
CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang)
@@ -10,13 +13,15 @@ CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang)
set (CMAKE_CXX_COMPILER_WORKS TRUE)
set (CMAKE_C_COMPILER_WORKS TRUE)
-set (ARM_ROOT "/usr/arm-linux-gnueabihf/include")
+set(TGT_TRIPLET arm-linux-gnueabihf)
+
+set (ARM_ROOT "/usr/${TGT_TRIPLET}/include")
if (NOT GCC_VER)
set (GCC_VER 5.4.0)
endif ()
-set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}")
+set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}")
-set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static")
+set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static")
set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}")
set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}")
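
Both toolchain files pin clang to a GNU cross triple and pick up the distro's cross libstdc++ headers via -isystem; EMULATOR presumably names the qemu user-mode binary used to run the resulting static test executables (the consuming CMake code is not part of this diff). A typical invocation would be along the lines of cmake -DCMAKE_TOOLCHAIN_FILE=cmake/aarch64.cmake <source-dir>.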
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -141,7 +141,7 @@ extern char* gets(char* __s);
#define CMT_ARCH_BITNESS_NAME "32-bit"
#endif
-#ifdef __ARM_NEON__
+#if defined __ARM_NEON__ || defined __ARM_NEON
#if __ARM_ARCH >= 8 && defined(__aarch64__)
#define CMT_ARCH_NEON64 1
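
Checking both spellings covers toolchains that define only one of them; in particular, AArch64 compilers advertise NEON via the ACLE name __ARM_NEON (no trailing underscores), so without this the CMT_ARCH_NEON64 branch could never fire on aarch64.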
diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp
@@ -142,7 +142,7 @@ using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>;
template <typename T>
struct typebits
{
- static_assert(is_number<deep_subtype<T>>::value, "");
+ // static_assert(is_number<deep_subtype<T>>::value, "");
constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8;
constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width;
using subtype = typename compound_type_traits<T>::subtype;
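
Relaxing the assert is presumably needed because typebits<> is now instantiated with bit<T> element types, which do not count as numbers; the bits/width/subtype computations below still go through compound_type_traits and remain valid.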
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -131,7 +131,7 @@ struct expression_rectangular : input_expression
size_t index, vec_shape<T, N>)
{
using TI = utype<T>;
- const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index);
+ const vec<TI, N> i = enumerate(vec_shape<TI, N>()) + static_cast<TI>(index);
return select(i < static_cast<TI>(self.m_size), T(1), T(0));
}
size_t size() const { return m_size; }
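
The only change is that enumerate() now takes a vec_shape tag instead of a default-constructed vector, matching the reworked overload set; the result presumably stays a counter vector, e.g. enumerate(vec_shape<u32, 4>()) yielding { 0, 1, 2, 3 }.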
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -39,100 +39,106 @@ namespace intrinsics
#if defined CMT_ARCH_SSE41
// horizontal OR
-KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); }
// horizontal AND
-KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
#endif
#if defined CMT_ARCH_AVX
// horizontal OR
-KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
// horizontal AND
-KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
#if defined CMT_ARCH_AVX512
// horizontal OR
-KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
// horizontal AND
-KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
-KFR_INTRINSIC bool bittestall(const f64avx512& x) { return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
-KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mf32avx512& x)
+{
+ return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mf64avx512& x)
+{
+ return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
#endif
#elif defined CMT_ARCH_SSE41
-KFR_INTRINSIC bool bittestany(const f32sse& x)
+KFR_INTRINSIC bool bittestany(const mf32sse& x)
{
return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
}
-KFR_INTRINSIC bool bittestany(const f64sse& x)
+KFR_INTRINSIC bool bittestany(const mf64sse& x)
{
return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
}
-KFR_INTRINSIC bool bittestall(const f32sse& x)
+KFR_INTRINSIC bool bittestall(const mf32sse& x)
{
return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
}
-KFR_INTRINSIC bool bittestall(const f64sse& x)
+KFR_INTRINSIC bool bittestall(const mf64sse& x)
{
return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
}
@@ -140,100 +146,100 @@ KFR_INTRINSIC bool bittestall(const f64sse& x)
#if !defined CMT_ARCH_SSE41
-KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); }
-KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); }
-KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); }
-
-KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); }
-KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); }
-KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); }
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); }
+
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); }
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
{
- return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+ return bittestall(expand_simd(a, bit<T>(true)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
{
return bittestall(low(a)) && bittestall(high(a));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
{
- return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+ return bittestany(expand_simd(a, bit<T>(false)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
{
return bittestany(low(a)) || bittestany(high(a));
}
#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-KFR_INTRINSIC bool bittestall(const u32neon& a)
+KFR_INTRINSIC bool bittestall(const mu32neon& a)
{
const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v));
return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
}
-KFR_INTRINSIC bool bittestany(const u32neon& a)
+KFR_INTRINSIC bool bittestany(const mu32neon& a)
{
const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v));
return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
}
-KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); }
-
-KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<u32>(a)); }
+
+KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<u32>(a)); }
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
{
- return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+ return bittestall(expand_simd(a, bit<T>(true)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
{
return bittestall(low(a)) && bittestall(high(a));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
{
- return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+ return bittestany(expand_simd(a, bit<T>(false)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
{
return bittestany(low(a)) || bittestany(high(a));
}
@@ -241,34 +247,34 @@ KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
#else
template <typename T, size_t N>
-KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x)
+KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x)
{
typename bitmask<N>::type val = 0;
for (size_t i = 0; i < N; i++)
{
- val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i;
+ val |= static_cast<int>(x[i]) << i;
}
return val;
}
template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& x)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x)
{
return getmask(x).value;
}
template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y)
{
return bittestany(x & y);
}
template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& x)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x)
{
return !getmask(~x).value;
}
template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y)
{
return !bittestany(~x & y);
}
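
Two details here are easy to miss. First, the generic getmask no longer extracts the lane's sign bit: a mask lane is now a bit<T>, so static_cast<int> already yields 0 or 1 (though the int-typed shift would overflow for lane counts above 32, which this fallback presumably never sees). Second, the AVX512 bittestall overloads complement a k-register-style lane mask, and for 32-/64-bit lanes that complement must be truncated back to the mask's width. A standalone illustration in plain C++, not KFR code:

    #include <cstdint>

    // _mm512_movepi32_mask packs one bit per 32-bit lane into a 16-bit mask.
    // Under ~, the uint16_t promotes to int, so the upper bits of the
    // complement become all ones and !~m would be false even when every
    // lane is set; truncating back to 16 bits restores the intended test.
    bool all16(std::uint16_t m) { return !static_cast<std::uint16_t>(~m); }
    // Same story for 64-bit lanes and their 8-bit mask:
    bool all8(std::uint8_t m) { return !static_cast<std::uint8_t>(~m); }
    // The 8-/16-bit-lane masks are already 64/32 bits wide, so ~ cannot
    // widen them further and no cast is needed there.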
diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp
@@ -35,265 +35,262 @@ namespace intrinsics
#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
-KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y)
+KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y)
+KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y)
+KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y)
+KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y)
+KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y)
+KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y)
+KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y)
+KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
{
return _mm_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y)
+KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
{
return _mm_blendv_ps(y.v, x.v, m.v);
}
-KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y)
+KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
{
return _mm_blendv_pd(y.v, x.v, m.v);
}
#if defined CMT_ARCH_AVX
-KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y)
+KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
{
return _mm256_blendv_pd(y.v, x.v, m.v);
}
-KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y)
+KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
{
return _mm256_blendv_ps(y.v, x.v, m.v);
}
#endif
#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y)
+KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u16avx select(const u16avx& m, const u16avx& x, const u16avx& y)
+KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y)
+KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y)
+KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y)
+KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y)
+KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y)
+KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
-KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y)
+KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
{
return _mm256_blendv_epi8(y.v, x.v, m.v);
}
#endif
#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y)
+KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
{
return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
}
-KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y)
+KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
{
return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
}
-KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y)
+KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
{
return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y)
+KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
{
return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y)
+KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
{
return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y)
+KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
{
return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC i8avx512 select(const i8avx512& m, const i8avx512& x, const i8avx512& y)
+KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
{
return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y)
+KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
{
return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y)
+KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
{
return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
}
-KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y)
+KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
{
return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
constexpr size_t Nout = next_simd_width<T>(N);
return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
.shuffle(csizeseq<N>);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
- vec<T, N> r;
- intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); });
- return r;
- // return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+ return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
// return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
{
constexpr size_t Nout = next_simd_width<T>(N);
return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
{
return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
{
constexpr size_t Nout = next_simd_width<T>(N);
return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
{
return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
{
constexpr size_t Nout = next_simd_width<T>(N);
return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
.shuffle(csizeseq<N>);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
{
return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
}
#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y)
+KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
{
return vbslq_f32(m.v, x.v, y.v);
}
-KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y)
+KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
{
return vbslq_s8(m.v, x.v, y.v);
}
-KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y)
+KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
{
return vbslq_u8(m.v, x.v, y.v);
}
-KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y)
+KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
{
return vbslq_s16(m.v, x.v, y.v);
}
-KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y)
+KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
{
return vbslq_u16(m.v, x.v, y.v);
}
-KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y)
+KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
{
return vbslq_s32(m.v, x.v, y.v);
}
-KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y)
+KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
{
return vbslq_u32(m.v, x.v, y.v);
}
-KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y)
+KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
{
return vbslq_s64(m.v, x.v, y.v);
}
-KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y)
+KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
{
return vbslq_u64(m.v, x.v, y.v);
}
#ifdef CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
{
return vbslq_f64(m.v, x.v, y.v);
}
#else
-KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
{
- return y ^ ((x ^ y) & m);
+ return y ^ ((x ^ y) & m.asvec());
}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
constexpr size_t Nout = next_simd_width<T>(N);
return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
.shuffle(csizeseq<N>);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
{
return select(m, vec<T, N>(x), vec<T, N>(y));
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
{
return select(m, x, vec<T, N>(y));
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
{
return select(m, vec<T, N>(x), y);
}
@@ -302,22 +299,22 @@ KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>&
// fallback
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
{
- return y ^ ((x ^ y) & m);
+ return y ^ ((x ^ y) & m.asvec());
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
{
return select(m, vec<T, N>(x), vec<T, N>(y));
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
{
return select(m, x, vec<T, N>(y));
}
template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
{
return select(m, vec<T, N>(x), y);
}
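
The fallback path (and the 32-bit NEON f64 case above) uses the classic branchless blend; masks now have to be converted back to plain vectors with asvec() before the bitwise arithmetic. On scalars the identity looks like this (illustration only):

    #include <cstdint>

    // With m all ones:  y ^ ((x ^ y) & m) == y ^ (x ^ y) == x.
    // With m all zeros: y ^ 0 == y.
    std::uint32_t blend(std::uint32_t m, std::uint32_t x, std::uint32_t y)
    {
        return y ^ ((x ^ y) & m);
    }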
diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp
@@ -39,7 +39,7 @@ inline namespace CMT_ARCH_NAME
template <typename T, size_t N>
KFR_INTRINSIC bool all(const mask<T, N>& x)
{
- return intrinsics::bittestall(x.asvec());
+ return intrinsics::bittestall(x);
}
/**
@@ -48,7 +48,7 @@ KFR_INTRINSIC bool all(const mask<T, N>& x)
template <typename T, size_t N>
KFR_INTRINSIC bool any(const mask<T, N>& x)
{
- return intrinsics::bittestany(x.asvec());
+ return intrinsics::bittestany(x);
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
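
No change for callers: all() and any() forward the mask straight through now that the bittestall/bittestany overloads are typed on mask<> rather than on its underlying vec, dropping the asvec() round-trip.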
diff --git a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp
@@ -43,7 +43,7 @@ template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_nume
KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
{
static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types");
- return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y));
+ return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), innercast<Tout>(x), innercast<Tout>(y));
}
/**
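
bitcast returns a plain vec, so after reinterpreting the mask's bits to the output element type the result has to be turned back into a mask with asmask() before it reaches the retyped intrinsics.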
diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp
@@ -36,7 +36,7 @@ namespace intrinsics
{
template <typename TT, size_t NN>
-using simd = TT __attribute__((ext_vector_type(NN)));
+using simd = unwrap_bit<TT> __attribute__((ext_vector_type(NN)));
template <typename T, size_t N1>
KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x);
@@ -51,13 +51,13 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete;
template <typename Tout, typename Arg>
KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg)
{
- return (simd<Tout, 1>){ static_cast<Tout>(arg) };
+ return (simd<Tout, 1>){ static_cast<unwrap_bit<Tout>>(arg) };
}
template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)>
KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args)
{
- return (simd<Tout, N>){ static_cast<Tout>(args)... };
+ return (simd<Tout, N>){ static_cast<unwrap_bit<Tout>>(args)... };
}
/// @brief Returns vector with undefined value
@@ -111,7 +111,7 @@ KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x)
template <typename T, size_t N>
KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value)
{
- return value;
+ return static_cast<unwrap_bit<T>>(value);
}
template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
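
ext_vector_type requires a genuine arithmetic element type, so the backend maps bit<T> lanes back to their underlying scalar before forming the builtin vector, and the added static_casts keep the initializers well-formed. unwrap_bit is presumably a transparent alias for everything except bit<T>, along these lines (assumption; the real definition and the unwrap_bit_impl name are hypothetical):

    // Hypothetical shape of unwrap_bit<T>:
    template <typename T>
    struct bit; // as sketched at the top of this commit

    template <typename T>
    struct unwrap_bit_impl { using type = T; };
    template <typename T>
    struct unwrap_bit_impl<bit<T>> { using type = T; };

    template <typename T>
    using unwrap_bit = typename unwrap_bit_impl<T>::type;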
diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp
@@ -49,7 +49,7 @@ namespace intrinsics
{
template <typename T, size_t N>
-using simd = typename simd_type<T, N>::type;
+using simd = typename simd_type<unwrap_bit<T>, N>::type;
template <typename T, size_t N, typename U>
union simd_small_array {
@@ -767,7 +767,7 @@ KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
template <typename T, size_t N, size_t... indices>
KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT
{
- return { x.val[indices]... };
+ return { static_cast<unwrap_bit<T>>(x.val[indices])... };
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)>
@@ -806,7 +806,7 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete;
template <typename Tout, typename Arg>
KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT
{
- return simd<Tout, 1>{ static_cast<Tout>(arg) };
+ return simd<Tout, 1>{ static_cast<unwrap_bit<Tout>>(static_cast<Tout>(arg)) };
}
template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)>
@@ -931,7 +931,7 @@ KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x,
template <typename T>
KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT
{
- return { value };
+ return { static_cast<unwrap_bit<T>>(value) };
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)>
@@ -964,7 +964,7 @@ simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsign
for (size_t i = 0; i < Nout; ++i)
{
const size_t index = indices[i];
- result.val[i] = index >= N ? T() : x.val[index];
+ result.val[i] = index >= N ? T() : static_cast<T>(x.val[index]);
}
return result;
}
@@ -977,7 +977,9 @@ simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd
for (size_t i = 0; i < Nout; ++i)
{
const size_t index = indices[i];
- result.val[i] = index > N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index];
+ result.val[i] = index >= N1 + N2
+ ? T()
+ : index >= N1 ? static_cast<T>(y.val[index - N1]) : static_cast<T>(x.val[index]);
}
return result;
}
@@ -992,7 +994,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csiz
constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array));
#else
- return from_simd_array<T, Nout>({ (indices > N ? T() : to_simd_array<T, N>(x).val[indices])... });
+ return from_simd_array<T, Nout>(
+ { (indices >= N ? T() : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... });
#endif
}
@@ -1009,9 +1012,9 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x,
return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array));
#else
return from_simd_array<T, Nout>(
- { (indices > N * 2 ? T()
- : indices >= N ? to_simd_array<T, N>(y).val[indices - N]
- : to_simd_array<T, N>(x).val[indices])... });
+ { (indices >= N * 2 ? T()
+ : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N])
+ : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... });
#endif
}
@@ -1031,8 +1034,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>&
return from_simd_array<T, Nout>(
{ (indices > N1 + N2 ? T()
- : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1]
- : to_simd_array<T, N1>(x).val[indices])... });
+ : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1])
+ : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... });
#endif
}
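
Besides threading unwrap_bit and static_cast through the element accesses, these hunks fix off-by-one bounds in the shuffle fallbacks: lane indices are zero-based, so the out-of-range guard must be index >= N (or >= N1 + N2); the old index > N let an index equal to N slip through and read one element past the end instead of producing T(). Note the mixed-size overload in the last hunk still compares indices > N1 + N2 in an unchanged context line.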
diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp
@@ -412,19 +412,19 @@ KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128
#else
KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] == y[i]));
}
KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] == y[i]));
}
KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] != y[i]));
}
KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] != y[i]));
}
#endif
@@ -458,35 +458,35 @@ KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
#else
KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] > y[i]));
}
KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] > y[i]));
}
KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] < y[i]));
}
KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] < y[i]));
}
KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] >= y[i]));
}
KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] >= y[i]));
}
KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
{
- KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i]));
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] <= y[i]));
}
KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y)
{
- KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i]));
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] <= y[i]));
}
#endif
@@ -1557,32 +1557,32 @@ KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] == y[i]));
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] != y[i]));
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] >= y[i]));
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] <= y[i]));
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] > y[i]));
}
template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
{
- KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i]));
+ KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] < y[i]));
}
template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
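
The only change in this file is that maskbits<T> is now referenced at namespace scope rather than through internal::, consistent with the mask rework; the comparison kernels still materialize each lane as all ones or all zeros.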
diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp
@@ -105,6 +105,39 @@ using u16avx512 = vec<u16, 32>;
using u32avx512 = vec<u32, 16>;
using u64avx512 = vec<u64, 8>;
+using mf32sse = mask<f32, 4>;
+using mf64sse = mask<f64, 2>;
+using mi8sse = mask<i8, 16>;
+using mi16sse = mask<i16, 8>;
+using mi32sse = mask<i32, 4>;
+using mi64sse = mask<i64, 2>;
+using mu8sse = mask<u8, 16>;
+using mu16sse = mask<u16, 8>;
+using mu32sse = mask<u32, 4>;
+using mu64sse = mask<u64, 2>;
+
+using mf32avx = mask<f32, 8>;
+using mf64avx = mask<f64, 4>;
+using mi8avx = mask<i8, 32>;
+using mi16avx = mask<i16, 16>;
+using mi32avx = mask<i32, 8>;
+using mi64avx = mask<i64, 4>;
+using mu8avx = mask<u8, 32>;
+using mu16avx = mask<u16, 16>;
+using mu32avx = mask<u32, 8>;
+using mu64avx = mask<u64, 4>;
+
+using mf32avx512 = mask<f32, 16>;
+using mf64avx512 = mask<f64, 8>;
+using mi8avx512 = mask<i8, 64>;
+using mi16avx512 = mask<i16, 32>;
+using mi32avx512 = mask<i32, 16>;
+using mi64avx512 = mask<i64, 8>;
+using mu8avx512 = mask<u8, 64>;
+using mu16avx512 = mask<u16, 32>;
+using mu32avx512 = mask<u32, 16>;
+using mu64avx512 = mask<u64, 8>;
+
#else
using f32neon = vec<f32, 4>;
using f64neon = vec<f64, 2>;
@@ -116,6 +149,17 @@ using u8neon = vec<u8, 16>;
using u16neon = vec<u16, 8>;
using u32neon = vec<u32, 4>;
using u64neon = vec<u64, 2>;
+
+using mf32neon = mask<f32, 4>;
+using mf64neon = mask<f64, 2>;
+using mi8neon = mask<i8, 16>;
+using mi16neon = mask<i16, 8>;
+using mi32neon = mask<i32, 4>;
+using mi64neon = mask<i64, 2>;
+using mu8neon = mask<u8, 16>;
+using mu16neon = mask<u16, 8>;
+using mu32neon = mask<u32, 4>;
+using mu64neon = mask<u64, 2>;
#endif
template <typename T>
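
The new aliases follow the obvious scheme, an m prefix plus element type and register class: mf32sse is mask<f32, 4> (mask for a 128-bit float vector), mu8avx512 is mask<u8, 64> (mask for a 512-bit byte vector). They give the per-architecture intrinsics above their mask-typed overload signatures.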
diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp
@@ -30,75 +30,6 @@ namespace kfr
inline namespace CMT_ARCH_NAME
{
-#if defined CMT_COMPILER_GNU
-constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
-constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
-constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
-constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
-#elif defined CMT_COMPILER_MSVC
-constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
-constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
-constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
-constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
-#else
-inline f32 allones_f32() CMT_NOEXCEPT
-{
- return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
-}
-inline f64 allones_f64() CMT_NOEXCEPT
-{
- return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
-}
-inline f32 invhighbit_f32() CMT_NOEXCEPT
-{
- return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
-}
-inline f64 invhighbit_f64() CMT_NOEXCEPT
-{
- return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
-}
-#endif
-
-template <typename T>
-struct special_scalar_constants
-{
- constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
- constexpr static T allones() { return static_cast<T>(-1ll); }
- constexpr static T allzeros() { return T(0); }
- constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
-};
-
-#ifndef CMT_COMPILER_INTEL
-#define KFR_CONSTEXPR_NON_INTEL constexpr
-#else
-#define KFR_CONSTEXPR_NON_INTEL
-#endif
-
-template <>
-struct special_scalar_constants<float>
-{
- constexpr static float highbitmask() { return -0.f; }
- KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
- constexpr static float allzeros() { return 0.f; }
- KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
-};
-
-template <>
-struct special_scalar_constants<double>
-{
- constexpr static double highbitmask() { return -0.; }
- KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
- constexpr static double allzeros() { return 0.; }
- KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
-};
-
-template <typename T>
-struct special_constants : public special_scalar_constants<subtype<T>>
-{
-public:
- using Tsub = subtype<T>;
-};
-
namespace intrinsics
{
@@ -142,6 +73,12 @@ struct alignas(alignment<T, N>()) simd_array
};
template <typename T, size_t N>
+struct alignas(alignment<T, N>()) simd_array<bit<T>, N>
+{
+ bit_value<T> val[next_poweroftwo(N)];
+};
+
+template <typename T, size_t N>
struct simd_type;
template <typename T>
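Note (not part of the patch): the bit specialization of simd_array rounds the lane count up, so non-power-of-two masks keep power-of-two storage. A standalone sketch of the same rounding rule, assuming it matches next_poweroftwo:

    #include <cstddef>
    // round a lane count up to the next power of two (hypothetical mirror
    // of next_poweroftwo, written here only for illustration)
    constexpr std::size_t next_pow2(std::size_t n, std::size_t p = 1)
    {
        return p >= n ? p : next_pow2(n, p * 2);
    }
    static_assert(next_pow2(3) == 4, "a 3-lane bit mask occupies 4 bit_value slots");
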
diff --git a/include/kfr/simd/impl/specialconstants.hpp b/include/kfr/simd/impl/specialconstants.hpp
@@ -0,0 +1,101 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../cometa/numeric.hpp"
+#include "intrinsics.h"
+
+namespace kfr
+{
+using namespace cometa;
+
+#if defined CMT_COMPILER_GNU
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
+#elif defined CMT_COMPILER_MSVC
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
+#else
+inline f32 allones_f32() CMT_NOEXCEPT
+{
+ return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
+}
+inline f64 allones_f64() CMT_NOEXCEPT
+{
+ return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
+}
+inline f32 invhighbit_f32() CMT_NOEXCEPT
+{
+ return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
+}
+inline f64 invhighbit_f64() CMT_NOEXCEPT
+{
+ return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
+}
+#endif
+
+template <typename T>
+struct special_scalar_constants
+{
+ constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
+ constexpr static T allones() { return static_cast<T>(-1ll); }
+ constexpr static T allzeros() { return T(0); }
+ constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
+};
+
+#ifndef CMT_COMPILER_INTEL
+#define KFR_CONSTEXPR_NON_INTEL constexpr
+#else
+#define KFR_CONSTEXPR_NON_INTEL
+#endif
+
+template <>
+struct special_scalar_constants<float>
+{
+ constexpr static float highbitmask() { return -0.f; }
+ KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); }
+ constexpr static float allzeros() { return 0.f; }
+ KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
+};
+
+template <>
+struct special_scalar_constants<double>
+{
+ constexpr static double highbitmask() { return -0.; }
+ KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); }
+ constexpr static double allzeros() { return 0.; }
+ KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
+};
+
+template <typename T>
+struct special_constants : public special_scalar_constants<subtype<T>>
+{
+public:
+ using Tsub = subtype<T>;
+};
+
+} // namespace kfr
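Note (not part of the patch): allones_f32/allones_f64 must produce the exact all-ones bit pattern, which reads back as a negative NaN; that is why the fallback path goes through SSE casts instead of float arithmetic. A hedged standalone check of the property, assuming only the standard headers shown:

    #include <cstdint>
    #include <cstring>
    // true when f carries the all-ones bit pattern (inspects raw bits, not value)
    inline bool is_allones_f32(float f)
    {
        std::uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return u == 0xFFFFFFFFu;
    }
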
diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp
@@ -39,60 +39,6 @@ using maskfor = typename T::mask_t;
namespace internal
{
-template <typename T>
-constexpr inline T maskbits(bool value)
-{
- return value ? special_constants<T>::allones() : special_constants<T>::allzeros();
-}
-} // namespace internal
-
-template <typename T, size_t N>
-struct mask : protected vec<T, N>
-{
- using base = vec<T, N>;
-
- KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default;
-
- KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default;
-
- KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default;
-
- using simd_type = typename base::simd_type;
-
- KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {}
-
- template <typename... Args>
- KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args)
- : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2),
- internal::maskbits<T>(static_cast<bool>(args))...)
- {
- }
-
- using vec<T, N>::v;
-
- KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT;
-
- KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {}
-
- template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
- KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
- {
- }
-
- template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))>
- KFR_MEM_INTRINSIC mask(const mask<U, N>& m)
- : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec()))))
- {
- }
-
- KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT;
-
- KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); }
-};
-
-namespace internal
-{
-
template <typename T, size_t Nout, size_t N1, size_t... indices>
constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>)
{
@@ -106,50 +52,11 @@ constexpr vec<T, Nout> partial_mask()
}
} // namespace internal
-template <typename T, size_t N>
-KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT
-{
- return ibitcast(base::operator[](index)) < 0;
-}
-
template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)>
-constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... args)
+constexpr KFR_INTRINSIC vec<bit<T>, Nout> make_mask(bool arg, Args... args)
{
- return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...);
+ return vec<bit<T>, Nout>(arg, static_cast<bool>(args)...);
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
-
-namespace cometa
-{
-
-template <typename T, size_t N>
-struct compound_type_traits<kfr::mask<T, N>>
-{
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static size_t width = N;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
- template <typename U>
- using rebind = kfr::mask<U, N>;
- template <typename U>
- using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
-
- KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index)
- {
- return value[index];
- }
-};
-} // namespace cometa
-
-namespace std
-{
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
-{
- using type = kfr::mask<typename common_type<T1, T2>::type, N>;
-};
-} // namespace std
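Note (not part of the patch): with mask<> built on vec<bit<T>>, make_mask now constructs the lanes directly from bools instead of expanding each through maskbits first. A minimal sketch mirroring the masks test below, assuming this header path:

    #include <kfr/simd/mask.hpp> // assumption: header as touched by this patch
    inline bool second_lane()
    {
        auto m = kfr::make_mask<float>(true, false); // now a vec<bit<float>, 2>
        return m[1];                                 // lanes read back as bools
    }
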
diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp
@@ -800,11 +800,13 @@ vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<
KFR_FN(packtranspose)
+#if 0
template <typename T, size_t N>
-KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT
+KFR_I_CE vec<bit<T>, N>::vec(const base& v) CMT_NOEXCEPT
{
this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v;
}
+#endif
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp
@@ -161,6 +161,8 @@ struct platform<cpu_t::common>
constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
constexpr static bool fast_unaligned = false;
+
+ constexpr static bool mask_registers = false;
};
template <>
struct platform<cpu_t::sse2> : platform<cpu_t::common>
@@ -208,6 +210,8 @@ struct platform<cpu_t::avx512> : platform<cpu_t::avx2>
constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
constexpr static size_t simd_register_count = bitness_const(8, 32);
+
+ constexpr static bool mask_registers = true;
};
#endif
#ifdef CMT_ARCH_ARM
@@ -234,6 +238,8 @@ struct platform<cpu_t::common>
constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
constexpr static bool fast_unaligned = false;
+
+ constexpr static bool mask_registers = false;
};
template <>
struct platform<cpu_t::neon> : platform<cpu_t::common>
@@ -279,8 +285,9 @@ constexpr static bool is_simd_size(size_t size)
template <typename T, size_t N = vector_width<T>>
struct vec;
+
template <typename T, size_t N = vector_width<T>>
-struct mask;
+using mask = vec<bit<T>, N>;
} // namespace CMT_ARCH_NAME
} // namespace kfr
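Note (not part of the patch): this alias is the heart of the commit: mask is no longer a separate class, so every vec facility applies to masks unchanged. A hedged compile-time check, assuming a KFR header defining vec/bit is already included:

    #include <type_traits>
    static_assert(std::is_same<kfr::mask<float, 4>, kfr::vec<kfr::bit<float>, 4>>::value,
                  "mask<T, N> is now an alias of vec<bit<T>, N>");
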
diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp
@@ -28,6 +28,7 @@
#include "../kfr.h"
#include "impl/intrinsics.h"
+#include "impl/specialconstants.hpp"
#include <climits>
@@ -40,8 +41,8 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers")
#ifdef KFR_TESTING
-#include "../testo/testo.hpp"
#include "../cometa/function.hpp"
+#include "../testo/testo.hpp"
#endif
#include "../cometa.hpp"
@@ -161,7 +162,8 @@ constexpr size_t max_test_size = 32;
template <template <typename, size_t> class vec_tpl, typename T,
typename sizes =
#ifdef KFR_EXTENDED_TESTS
- cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)>
+ cfilter_t<decltype(test_vector_sizes),
+ decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)>
#else
csizes_t<1, 2>
#endif
@@ -255,9 +257,88 @@ struct bitmask
};
template <typename T>
-struct maskbit
+constexpr inline T maskbits(bool value)
+{
+ return value ? special_constants<T>::allones() : special_constants<T>::allzeros();
+}
+
+template <typename T>
+struct bit_value;
+
+template <typename T>
+struct bit
+{
+ alignas(T) bool value;
+ bit() CMT_NOEXCEPT = default;
+
+ constexpr bit(const bit_value<T>& value) CMT_NOEXCEPT : value(static_cast<bool>(value)) {}
+
+ constexpr bit(T value) CMT_NOEXCEPT : value(bitcast_anything<itype<T>>(value) < 0) {}
+ constexpr bit(bool value) CMT_NOEXCEPT : value(value) {}
+
+ template <typename U>
+ constexpr bit(const bit<U>& value) CMT_NOEXCEPT : value(value.value)
+ {
+ }
+
+ constexpr operator bool() const CMT_NOEXCEPT { return value; }
+ constexpr explicit operator T() const CMT_NOEXCEPT { return maskbits<T>(value); }
+};
+
+template <typename T>
+struct bit_value
+{
+ T value;
+ bit_value() CMT_NOEXCEPT = default;
+
+ constexpr bit_value(const bit<T>& value) CMT_NOEXCEPT : bit_value(value.value) {}
+
+ constexpr bit_value(T value) CMT_NOEXCEPT : value(value) {}
+ constexpr bit_value(bool value) CMT_NOEXCEPT : value(maskbits<T>(value)) {}
+
+ template <typename U>
+ constexpr bit_value(const bit_value<U>& value) CMT_NOEXCEPT : bit_value(value.operator bool())
+ {
+ }
+
+ constexpr operator bool() const CMT_NOEXCEPT { return bitcast_anything<itype<T>>(value) < 0; }
+ constexpr explicit operator T() const CMT_NOEXCEPT { return value; }
+};
+
+template <typename T>
+struct special_scalar_constants<bit<T>>
+{
+ constexpr static bit<T> highbitmask() { return true; }
+ constexpr static bit<T> allones() noexcept { return true; }
+ constexpr static bit<T> allzeros() { return false; }
+ constexpr static bit<T> invhighbitmask() { return false; }
+};
+
+namespace internal_generic
+{
+template <typename T>
+struct unwrap_bit
+{
+ using type = T;
+};
+template <typename T>
+struct unwrap_bit<bit<T>>
+{
+ using type = T;
+};
+
+} // namespace internal_generic
+
+template <typename T>
+using unwrap_bit = typename internal_generic::unwrap_bit<T>::type;
+
+template <typename T>
+struct is_bit : cfalse_t
+{
+};
+template <typename T>
+struct is_bit<bit<T>> : ctrue_t
{
- bool value;
};
namespace fn_generic
@@ -340,6 +421,11 @@ struct is_simd_type
{
};
+template <typename T>
+struct is_simd_type<bit<T>> : is_simd_type<T>
+{
+};
+
template <typename T, size_t N>
struct vec_shape
{
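Note (not part of the patch): bit<T> stores the truth value as a bool, while bit_value<T> stores the expanded all-ones/all-zeros pattern of T; conversions between them go through maskbits and a sign-bit test. A hedged round-trip sketch for the float case, assuming the kfr types are in scope:

    inline bool roundtrip()
    {
        kfr::bit<float> b(true);           // stores a bool
        float pattern = float(b);          // explicit cast expands to maskbits<float>(true)
        kfr::bit_value<float> bv(pattern); // stores the float bit pattern itself
        return bv;                         // implicit bool: negative-as-integer test restores true
    }
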
diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp
@@ -112,9 +112,6 @@ template <typename T, size_t N>
struct vec;
template <typename T, size_t N>
-struct mask;
-
-template <typename T, size_t N>
struct vec_halves
{
vec<T, prev_poweroftwo(N - 1)> low;
@@ -250,15 +247,15 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
}
// from vector of another type
- template <typename U,
- KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)>
+ template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value &&
+ (compound_type_traits<T>::is_scalar && !is_bit<U>::value))>
KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
: v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v))
{
}
- template <typename U,
- KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
+ template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value &&
+ !(compound_type_traits<T>::is_scalar && !is_bit<U>::value))>
KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
: v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v)
{
@@ -412,6 +409,11 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); }
+ KFR_MEM_INTRINSIC constexpr vec<unwrap_bit<T>, N> asvec() const CMT_NOEXCEPT
+ {
+ return vec<unwrap_bit<T>, N>(v);
+ }
+
constexpr static size_t simd_element_size = const_min(vector_width<T>, N);
constexpr static size_t simd_element_count = N / simd_element_size;
using simd_element_type = simd<ST, simd_element_size>;
@@ -1152,7 +1154,18 @@ void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isap
namespace internal
{
-// vector<vector> to vector<vector>
+// mask to mask (element type conversion)
+template <typename To, typename From, size_t N>
+struct conversion<vec<bit<To>, N>, vec<bit<From>, N>>
+{
+ static vec<bit<To>, N> cast(const vec<bit<From>, N>& value)
+ {
+ return vec<To, N>::frombits(innercast<itype<To>>(vec<itype<From>, N>::frombits(value.asvec())))
+ .asmask();
+ }
+};
+
+// vector to vector<vector>
template <typename To, typename From, size_t N1, size_t N2, size_t Ns1>
struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>>
{
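Note (not part of the patch): changing a mask's element type now routes through the first conversion specialization above, which reinterprets the lanes as integers, innercasts, and reinterprets back. A usage sketch mirroring the mask_broadcast test below:

    inline kfr::mask<kfr::i32, 4> to_i32(const kfr::mask<kfr::f32, 4>& mf)
    {
        return kfr::mask<kfr::i32, 4>(mf); // per-lane truth preserved, storage re-cast
    }
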
diff --git a/sources.cmake b/sources.cmake
@@ -136,6 +136,7 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i
${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -33,7 +33,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/")
if (ENABLE_ASMTEST)
add_executable(asm_test asm_test.cpp)
target_link_libraries(asm_test kfr)
- target_set_arch(asm_test PRIVATE avx2)
+ target_set_arch(asm_test PRIVATE sse2)
target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options(asm_test PRIVATE -fno-stack-protector)
@@ -153,9 +153,7 @@ if(USE_SDE)
find_program(EMULATOR "sde")
list(APPEND EMULATOR "-skx")
list(APPEND EMULATOR "--")
-elseif (ARM)
- find_program(EMULATOR "qemu-arm")
-else ()
+elseif (NOT EMULATOR)
set(EMULATOR "")
endif ()
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -133,6 +133,13 @@ TEST(mix)
});
}
+TEST(expression_mask)
+{
+ univector<float> x(100);
+ univector<float> y(100);
+ x = select(x > y, 0.5f, 0.1f) * (y - x) + x;
+}
+
constexpr inline size_t fast_range_sum(size_t stop) { return stop * (stop + 1) / 2; }
TEST(partition)
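Note (not part of the patch): the new expression_mask test exercises select() on a mask-valued expression; per element it blends x toward y with weight 0.5 where x > y and 0.1 elsewhere. A scalar reference of the same computation, written here only as a sketch:

    inline float mix_step(float x, float y)
    {
        float t = (x > y) ? 0.5f : 0.1f; // select(x > y, 0.5f, 0.1f)
        return t * (y - x) + x;          // lerp toward y by t
    }
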
diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp
@@ -142,7 +142,7 @@ TEST(eq)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x == y);
+ return maskbits<subtype<decltype(x)>>(x == y);
});
}
@@ -150,7 +150,7 @@ TEST(ne)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x != y);
+ return maskbits<subtype<decltype(x)>>(x != y);
});
}
@@ -158,7 +158,7 @@ TEST(ge)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x >= y);
+ return maskbits<subtype<decltype(x)>>(x >= y);
});
}
@@ -166,7 +166,7 @@ TEST(le)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x <= y);
+ return maskbits<subtype<decltype(x)>>(x <= y);
});
}
@@ -174,7 +174,7 @@ TEST(gt)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x > y);
+ return maskbits<subtype<decltype(x)>>(x > y);
});
}
@@ -182,7 +182,7 @@ TEST(lt)
{
test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); },
[](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
- return internal::maskbits<subtype<decltype(x)>>(x < y);
+ return maskbits<subtype<decltype(x)>>(x < y);
});
}
diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp
@@ -124,11 +124,31 @@ TEST(unaligned_read)
for (size_t i = 0; i < N; i++)
{
-// testo::scope sc(as_string("i = ", i));
+ testo::scope sc(as_string("i = ", i));
CHECK(read<N, false>(data + i) == (enumerate<Tsub, N>() + static_cast<Tsub>(i)));
}
});
}
+TEST(mask_broadcast)
+{
+ CHECK(mask<i32, 4>(mask<f32, 4>(true, false, true, false)).asvec() == vec<i32, 4>(-1, 0, -1, 0));
+ CHECK(mask<i32, 4>(mask<f32, 4>(true)).asvec() == vec<i32, 4>(-1, -1, -1, -1));
+}
+
+TEST(masks)
+{
+ mask<float, 4> m = make_mask<float>(false, true, false, true);
+ vec<float, 4> v = m.asvec();
+ CHECK(bit<float>(m[0]) == false);
+ CHECK(bit<float>(m[1]) == true);
+ CHECK(bit<float>(m[2]) == false);
+ CHECK(bit<float>(m[3]) == true);
+ CHECK(float(v[0]) == maskbits<float>(false));
+ CHECK(float(v[1]) == maskbits<float>(true));
+ CHECK(float(v[2]) == maskbits<float>(false));
+ CHECK(float(v[3]) == maskbits<float>(true));
+}
+
} // namespace CMT_ARCH_NAME
} // namespace kfr