make mask<> a specialization of vec<> - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 320a0bf21df43c4494344571e66570d8a8dcb684
parent c6074e188884ca756adfef7b8b7ea0fd1d73b5f8
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Mon, 11 Mar 2019 15:54:04 +0000

make mask<> a specialization of vec<>

Diffstat:
A cmake/aarch64.cmake  | 37 +++++++++++++++++++++++++++++++++++++
M cmake/arm.cmake  | 11 ++++++++---
M include/kfr/cident.h  | 2 +-
M include/kfr/cometa/numeric.hpp  | 2 +-
M include/kfr/dsp/window.hpp  | 2 +-
M include/kfr/math/impl/logical.hpp  | 250 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
M include/kfr/math/impl/select.hpp  | 125 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M include/kfr/math/logical.hpp  | 4 ++--
M include/kfr/math/select.hpp  | 2 +-
M include/kfr/simd/impl/backend_clang.hpp  | 8 ++++----
M include/kfr/simd/impl/backend_generic.hpp  | 27 +++++++++++++++------------
M include/kfr/simd/impl/basicoperators_generic.hpp  | 36 ++++++++++++++++++------------------
M include/kfr/simd/impl/function.hpp  | 44 ++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/simd/impl/simd.hpp  | 75 ++++++---------------------------------------------------------------------
A include/kfr/simd/impl/specialconstants.hpp  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/simd/mask.hpp  | 97 ++-----------------------------------------------------------------------------
M include/kfr/simd/operators.hpp  | 4 +++-
M include/kfr/simd/platform.hpp  | 9 ++++++++-
M include/kfr/simd/types.hpp  | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M include/kfr/simd/vec.hpp  | 29 +++++++++++++++++++++--------
M sources.cmake  | 1 +
M tests/CMakeLists.txt  | 6 ++----
M tests/expression_test.cpp  | 7 +++++++
M tests/unit/simd/operators.cpp  | 12 ++++++------
M tests/unit/simd/vec.cpp  | 22 +++++++++++++++++++++-

25 files changed, 589 insertions(+), 418 deletions(-)
diff --git a/cmake/aarch64.cmake b/cmake/aarch64.cmake
@@ -0,0 +1,37 @@
+# For internal use only
+
+set (CMAKE_SYSTEM_NAME Linux)
+set (CMAKE_SYSTEM_VERSION 1)
+set (UNIX True)
+set (ARM True)
+set (AARCH64 True)
+set (CMAKE_SYSTEM_PROCESSOR aarch64)
+set (EMULATOR qemu-aarch64)
+
+include (CMakeForceCompiler)
+CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang)
+CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang)
+set (CMAKE_CXX_COMPILER_WORKS TRUE)
+set (CMAKE_C_COMPILER_WORKS TRUE)
+
+set(TGT_TRIPLET aarch64-linux-gnu)
+
+set (ARM_ROOT "/usr/${TGT_TRIPLET}/include")
+if (NOT GCC_VER)
+    set (GCC_VER 5.4.0)
+endif ()
+set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}")
+
+set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a72 -static")
+
+set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}")
+set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}")
+
+set (CMAKE_CXX_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+set (CMAKE_C_LINK_FLAGS " ${ARM_COMMON_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+
+message(STATUS "${ARM_COMMON_FLAGS}")
+
+set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/cmake/arm.cmake b/cmake/arm.cmake
@@ -1,8 +1,11 @@
+# For internal use only
+
 set (CMAKE_SYSTEM_NAME Linux)
 set (CMAKE_SYSTEM_VERSION 1)
 set (UNIX True)
 set (ARM True)
 set (CMAKE_SYSTEM_PROCESSOR arm)
+set (EMULATOR qemu-arm)
 
 include (CMakeForceCompiler)
 CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Clang)
@@ -10,13 +13,15 @@ CMAKE_FORCE_C_COMPILER (/usr/bin/clang Clang)
 set (CMAKE_CXX_COMPILER_WORKS TRUE)
 set (CMAKE_C_COMPILER_WORKS TRUE)
 
-set (ARM_ROOT "/usr/arm-linux-gnueabihf/include")
+set(TGT_TRIPLET arm-linux-gnueabihf)
+
+set (ARM_ROOT "/usr/${TGT_TRIPLET}/include")
 if (NOT GCC_VER)
     set (GCC_VER 5.4.0)
 endif ()
-set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}")
+set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/${TGT_TRIPLET} -isystem ${ARM_ROOT}")
 
-set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static")
+set (ARM_COMMON_FLAGS "-target ${TGT_TRIPLET} -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static")
 
 set (CMAKE_CXX_FLAGS "${SYS_PATHS} ${ARM_COMMON_FLAGS}")
 set (CMAKE_C_FLAGS " ${SYS_PATHS} ${ARM_COMMON_FLAGS}")
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -141,7 +141,7 @@ extern char* gets(char* __s);
 #define CMT_ARCH_BITNESS_NAME "32-bit"
 #endif
 
-#ifdef __ARM_NEON__
+#if defined __ARM_NEON__ || defined __ARM_NEON
 
 #if __ARM_ARCH >= 8 && defined(__aarch64__)
 #define CMT_ARCH_NEON64 1
diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp
@@ -142,7 +142,7 @@ using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>;
 template <typename T>
 struct typebits
 {
-    static_assert(is_number<deep_subtype<T>>::value, "");
+    // static_assert(is_number<deep_subtype<T>>::value, "");
     constexpr static size_t bits  = sizeof(typename compound_type_traits<T>::subtype) * 8;
     constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width;
     using subtype                 = typename compound_type_traits<T>::subtype;
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -131,7 +131,7 @@ struct expression_rectangular : input_expression
                                                     size_t index, vec_shape<T, N>)
     {
         using TI           = utype<T>;
-        const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index);
+        const vec<TI, N> i = enumerate(vec_shape<TI, N>()) + static_cast<TI>(index);
         return select(i < static_cast<TI>(self.m_size), T(1), T(0));
     }
     size_t size() const { return m_size; }
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -39,100 +39,106 @@ namespace intrinsics
 #if defined CMT_ARCH_SSE41
 
 // horizontal OR
-KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); }
 
 // horizontal AND
-KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
 #endif
 
 #if defined CMT_ARCH_AVX
 // horizontal OR
-KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); }
 
-KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
 
-KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
 
 // horizontal AND
-KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
 
-KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
 
-KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
 
 #if defined CMT_ARCH_AVX512
 // horizontal OR
-KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
 
 // horizontal AND
-KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
-KFR_INTRINSIC bool bittestall(const f64avx512& x) { return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
-KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mf32avx512& x)
+{
+    return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mf64avx512& x)
+{
+    return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
 
 #endif
 
 #elif defined CMT_ARCH_SSE41
-KFR_INTRINSIC bool bittestany(const f32sse& x)
+KFR_INTRINSIC bool bittestany(const mf32sse& x)
 {
     return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
 }
-KFR_INTRINSIC bool bittestany(const f64sse& x)
+KFR_INTRINSIC bool bittestany(const mf64sse& x)
 {
     return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
 }
-KFR_INTRINSIC bool bittestall(const f32sse& x)
+KFR_INTRINSIC bool bittestall(const mf32sse& x)
 {
     return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
 }
-KFR_INTRINSIC bool bittestall(const f64sse& x)
+KFR_INTRINSIC bool bittestall(const mf64sse& x)
 {
     return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
 }
@@ -140,100 +146,100 @@ KFR_INTRINSIC bool bittestall(const f64sse& x)
 
 #if !defined CMT_ARCH_SSE41
 
-KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); }
-KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); }
-KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); }
-
-KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); }
-KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); }
-KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); }
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); }
+
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); }
 #endif
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
 {
-    return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+    return bittestall(expand_simd(a, bit<T>(true)));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
 {
     return bittestall(low(a)) && bittestall(high(a));
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
 {
-    return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+    return bittestany(expand_simd(a, bit<T>(false)));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
 {
     return bittestany(low(a)) || bittestany(high(a));
 }
 
 #elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
-KFR_INTRINSIC bool bittestall(const u32neon& a)
+KFR_INTRINSIC bool bittestall(const mu32neon& a)
 {
     const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v));
     return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
 }
 
-KFR_INTRINSIC bool bittestany(const u32neon& a)
+KFR_INTRINSIC bool bittestany(const mu32neon& a)
 {
     const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v));
     return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
 }
-KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); }
-
-KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<u32>(a)); }
+
+KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<u32>(a)); }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
 {
-    return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+    return bittestall(expand_simd(a, bit<T>(true)));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
 {
     return bittestall(low(a)) && bittestall(high(a));
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
 {
-    return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+    return bittestany(expand_simd(a, bit<T>(false)));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
 {
     return bittestany(low(a)) || bittestany(high(a));
 }
@@ -241,34 +247,34 @@ KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
 #else
 
 template <typename T, size_t N>
-KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x)
+KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x)
 {
     typename bitmask<N>::type val = 0;
     for (size_t i = 0; i < N; i++)
     {
-        val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i;
+        val |= static_cast<int>(x[i]) << i;
     }
     return val;
 }
 
 template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& x)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x)
 {
     return getmask(x).value;
 }
 template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y)
 {
     return bittestany(x & y);
 }
 
 template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& x)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x)
 {
     return !getmask(~x).value;
 }
 template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y)
 {
     return !bittestany(~x & y);
 }
diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp
@@ -35,265 +35,262 @@ namespace intrinsics
 
 #if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
 
-KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y)
+KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y)
+KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y)
+KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y)
+KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y)
+KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y)
+KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y)
+KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y)
+KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
 {
     return _mm_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y)
+KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
 {
     return _mm_blendv_ps(y.v, x.v, m.v);
 }
-KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y)
+KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
 {
     return _mm_blendv_pd(y.v, x.v, m.v);
 }
 
 #if defined CMT_ARCH_AVX
-KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y)
+KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
 {
     return _mm256_blendv_pd(y.v, x.v, m.v);
 }
-KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y)
+KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
 {
     return _mm256_blendv_ps(y.v, x.v, m.v);
 }
 #endif
 
 #if defined CMT_ARCH_AVX2
-KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y)
+KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u16avx select(const u16avx& m, const u16avx& x, const u16avx& y)
+KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y)
+KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y)
+KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y)
+KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y)
+KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y)
+KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
-KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y)
+KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
 {
     return _mm256_blendv_epi8(y.v, x.v, m.v);
 }
 #endif
 
 #if defined CMT_ARCH_AVX512
-KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y)
+KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
 {
     return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
 }
-KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y)
+KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
 {
     return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
 }
-KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y)
+KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
 {
     return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y)
+KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
 {
     return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y)
+KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
 {
     return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y)
+KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
 {
     return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC i8avx512 select(const i8avx512& m, const i8avx512& x, const i8avx512& y)
+KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
 {
     return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y)
+KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
 {
     return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y)
+KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
 {
     return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
 }
-KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y)
+KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
 {
     return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
 }
 #endif
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     constexpr size_t Nout = next_simd_width<T>(N);
     return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
         .shuffle(csizeseq<N>);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
-    vec<T, N> r;
-    intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); });
-    return r;
-    //    return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+    return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
     //    return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
 {
     constexpr size_t Nout = next_simd_width<T>(N);
     return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
 {
     return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
 {
     constexpr size_t Nout = next_simd_width<T>(N);
     return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
 {
     return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
 {
     constexpr size_t Nout = next_simd_width<T>(N);
     return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
         .shuffle(csizeseq<N>);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
 {
     return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
 }
 
 #elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
-KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y)
+KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
 {
     return vbslq_f32(m.v, x.v, y.v);
 }
-KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y)
+KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
 {
     return vbslq_s8(m.v, x.v, y.v);
 }
-KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y)
+KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
 {
     return vbslq_u8(m.v, x.v, y.v);
 }
-KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y)
+KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
 {
     return vbslq_s16(m.v, x.v, y.v);
 }
-KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y)
+KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
 {
     return vbslq_u16(m.v, x.v, y.v);
 }
-KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y)
+KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
 {
     return vbslq_s32(m.v, x.v, y.v);
 }
-KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y)
+KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
 {
     return vbslq_u32(m.v, x.v, y.v);
 }
-KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y)
+KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
 {
     return vbslq_s64(m.v, x.v, y.v);
 }
-KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y)
+KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
 {
     return vbslq_u64(m.v, x.v, y.v);
 }
 
 #ifdef CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
 {
     return vbslq_f64(m.v, x.v, y.v);
 }
 #else
-KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
 {
-    return y ^ ((x ^ y) & m);
+    return y ^ ((x ^ y) & m.asvec());
 }
 #endif
 
 template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     constexpr size_t Nout = next_simd_width<T>(N);
     return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
         .shuffle(csizeseq<N>);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
 {
     return select(m, vec<T, N>(x), vec<T, N>(y));
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
 {
     return select(m, x, vec<T, N>(y));
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
 {
     return select(m, vec<T, N>(x), y);
 }
@@ -302,22 +299,22 @@ KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& 
 
 // fallback
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
 {
-    return y ^ ((x ^ y) & m);
+    return y ^ ((x ^ y) & m.asvec());
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
 {
     return select(m, vec<T, N>(x), vec<T, N>(y));
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
 {
     return select(m, x, vec<T, N>(y));
 }
 template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
 {
     return select(m, vec<T, N>(x), y);
 }
diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp
@@ -39,7 +39,7 @@ inline namespace CMT_ARCH_NAME
 template <typename T, size_t N>
 KFR_INTRINSIC bool all(const mask<T, N>& x)
 {
-    return intrinsics::bittestall(x.asvec());
+    return intrinsics::bittestall(x);
 }
 
 /**
@@ -48,7 +48,7 @@ KFR_INTRINSIC bool all(const mask<T, N>& x)
 template <typename T, size_t N>
 KFR_INTRINSIC bool any(const mask<T, N>& x)
 {
-    return intrinsics::bittestany(x.asvec());
+    return intrinsics::bittestany(x);
 }
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
diff --git a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp
@@ -43,7 +43,7 @@ template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_nume
 KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
 {
     static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types");
-    return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y));
+    return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), innercast<Tout>(x), innercast<Tout>(y));
 }
 
 /**
diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp
@@ -36,7 +36,7 @@ namespace intrinsics
 {
 
 template <typename TT, size_t NN>
-using simd = TT __attribute__((ext_vector_type(NN)));
+using simd = unwrap_bit<TT> __attribute__((ext_vector_type(NN)));
 
 template <typename T, size_t N1>
 KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x);
@@ -51,13 +51,13 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete;
 template <typename Tout, typename Arg>
 KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg)
 {
-    return (simd<Tout, 1>){ static_cast<Tout>(arg) };
+    return (simd<Tout, 1>){ static_cast<unwrap_bit<Tout>>(arg) };
 }
 
 template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)>
 KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args)
 {
-    return (simd<Tout, N>){ static_cast<Tout>(args)... };
+    return (simd<Tout, N>){ static_cast<unwrap_bit<Tout>>(args)... };
 }
 
 /// @brief Returns vector with undefined value
@@ -111,7 +111,7 @@ KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x)
 template <typename T, size_t N>
 KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value)
 {
-    return value;
+    return static_cast<unwrap_bit<T>>(value);
 }
 
 template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp
@@ -49,7 +49,7 @@ namespace intrinsics
 {
 
 template <typename T, size_t N>
-using simd = typename simd_type<T, N>::type;
+using simd = typename simd_type<unwrap_bit<T>, N>::type;
 
 template <typename T, size_t N, typename U>
 union simd_small_array {
@@ -767,7 +767,7 @@ KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
 template <typename T, size_t N, size_t... indices>
 KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT
 {
-    return { x.val[indices]... };
+    return { static_cast<unwrap_bit<T>>(x.val[indices])... };
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)>
@@ -806,7 +806,7 @@ KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete;
 template <typename Tout, typename Arg>
 KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT
 {
-    return simd<Tout, 1>{ static_cast<Tout>(arg) };
+    return simd<Tout, 1>{ static_cast<unwrap_bit<Tout>>(static_cast<Tout>(arg)) };
 }
 
 template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)>
@@ -931,7 +931,7 @@ KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x,
 template <typename T>
 KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT
 {
-    return { value };
+    return { static_cast<unwrap_bit<T>>(value) };
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)>
@@ -964,7 +964,7 @@ simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsign
     for (size_t i = 0; i < Nout; ++i)
     {
         const size_t index = indices[i];
-        result.val[i]      = index >= N ? T() : x.val[index];
+        result.val[i]      = index >= N ? T() : static_cast<T>(x.val[index]);
     }
     return result;
 }
@@ -977,7 +977,9 @@ simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd
     for (size_t i = 0; i < Nout; ++i)
     {
         const size_t index = indices[i];
-        result.val[i]      = index > N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index];
+        result.val[i]      = index >= N1 + N2
+                            ? T()
+                            : index >= N1 ? static_cast<T>(y.val[index - N1]) : static_cast<T>(x.val[index]);
     }
     return result;
 }
@@ -992,7 +994,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csiz
     constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
     return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array));
 #else
-    return from_simd_array<T, Nout>({ (indices > N ? T() : to_simd_array<T, N>(x).val[indices])... });
+    return from_simd_array<T, Nout>(
+        { (indices >= N ? T() : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... });
 #endif
 }
 
@@ -1009,9 +1012,9 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, 
     return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array));
 #else
     return from_simd_array<T, Nout>(
-        { (indices > N * 2 ? T()
-                           : indices >= N ? to_simd_array<T, N>(y).val[indices - N]
-                                          : to_simd_array<T, N>(x).val[indices])... });
+        { (indices >= N * 2 ? T()
+                            : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N])
+                                           : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... });
 #endif
 }
 
@@ -1031,8 +1034,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& 
 
     return from_simd_array<T, Nout>(
         { (indices > N1 + N2 ? T()
-                             : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1]
-                                             : to_simd_array<T, N1>(x).val[indices])... });
+                             : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1])
+                                             : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... });
 #endif
 }
 
diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp
@@ -412,19 +412,19 @@ KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128
 #else
 KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] == y[i]));
 }
 KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] == y[i]));
 }
 KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] != y[i]));
 }
 KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] != y[i]));
 }
 #endif
 
@@ -458,35 +458,35 @@ KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
 #else
 KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] > y[i]));
 }
 KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] > y[i]));
 }
 KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] < y[i]));
 }
 KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] < y[i]));
 }
 KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] >= y[i]));
 }
 KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] >= y[i]));
 }
 KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i]));
+    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = maskbits<u64>(x[i] <= y[i]));
 }
 KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y)
 {
-    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i]));
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = maskbits<i64>(x[i] <= y[i]));
 }
 #endif
 
@@ -1557,32 +1557,32 @@ KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] == y[i]));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] != y[i]));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] >= y[i]));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] <= y[i]));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] > y[i]));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
 KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
 {
-    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i]));
+    KFR_COMPONENTWISE_RET(result[i] = maskbits<T>(x[i] < y[i]));
 }
 
 template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp
@@ -105,6 +105,39 @@ using u16avx512 = vec<u16, 32>;
 using u32avx512 = vec<u32, 16>;
 using u64avx512 = vec<u64, 8>;
 
+using mf32sse = mask<f32, 4>;
+using mf64sse = mask<f64, 2>;
+using mi8sse  = mask<i8, 16>;
+using mi16sse = mask<i16, 8>;
+using mi32sse = mask<i32, 4>;
+using mi64sse = mask<i64, 2>;
+using mu8sse  = mask<u8, 16>;
+using mu16sse = mask<u16, 8>;
+using mu32sse = mask<u32, 4>;
+using mu64sse = mask<u64, 2>;
+
+using mf32avx = mask<f32, 8>;
+using mf64avx = mask<f64, 4>;
+using mi8avx  = mask<i8, 32>;
+using mi16avx = mask<i16, 16>;
+using mi32avx = mask<i32, 8>;
+using mi64avx = mask<i64, 4>;
+using mu8avx  = mask<u8, 32>;
+using mu16avx = mask<u16, 16>;
+using mu32avx = mask<u32, 8>;
+using mu64avx = mask<u64, 4>;
+
+using mf32avx512 = mask<f32, 16>;
+using mf64avx512 = mask<f64, 8>;
+using mi8avx512  = mask<i8, 64>;
+using mi16avx512 = mask<i16, 32>;
+using mi32avx512 = mask<i32, 16>;
+using mi64avx512 = mask<i64, 8>;
+using mu8avx512  = mask<u8, 64>;
+using mu16avx512 = mask<u16, 32>;
+using mu32avx512 = mask<u32, 16>;
+using mu64avx512 = mask<u64, 8>;
+
 #else
 using f32neon = vec<f32, 4>;
 using f64neon = vec<f64, 2>;
@@ -116,6 +149,17 @@ using u8neon  = vec<u8, 16>;
 using u16neon = vec<u16, 8>;
 using u32neon = vec<u32, 4>;
 using u64neon = vec<u64, 2>;
+
+using mf32neon = mask<f32, 4>;
+using mf64neon = mask<f64, 2>;
+using mi8neon  = mask<i8, 16>;
+using mi16neon = mask<i16, 8>;
+using mi32neon = mask<i32, 4>;
+using mi64neon = mask<i64, 2>;
+using mu8neon  = mask<u8, 16>;
+using mu16neon = mask<u16, 8>;
+using mu32neon = mask<u32, 4>;
+using mu64neon = mask<u64, 2>;
 #endif
 
 template <typename T>
diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp
@@ -30,75 +30,6 @@ namespace kfr
 inline namespace CMT_ARCH_NAME
 {
 
-#if defined CMT_COMPILER_GNU
-constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
-constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
-constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
-constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
-#elif defined CMT_COMPILER_MSVC
-constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
-constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
-constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
-constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
-#else
-inline f32 allones_f32() CMT_NOEXCEPT
-{
-    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
-}
-inline f64 allones_f64() CMT_NOEXCEPT
-{
-    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
-}
-inline f32 invhighbit_f32() CMT_NOEXCEPT
-{
-    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
-}
-inline f64 invhighbit_f64() CMT_NOEXCEPT
-{
-    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
-}
-#endif
-
-template <typename T>
-struct special_scalar_constants
-{
-    constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
-    constexpr static T allones() { return static_cast<T>(-1ll); }
-    constexpr static T allzeros() { return T(0); }
-    constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
-};
-
-#ifndef CMT_COMPILER_INTEL
-#define KFR_CONSTEXPR_NON_INTEL constexpr
-#else
-#define KFR_CONSTEXPR_NON_INTEL
-#endif
-
-template <>
-struct special_scalar_constants<float>
-{
-    constexpr static float highbitmask() { return -0.f; }
-    KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
-    constexpr static float allzeros() { return 0.f; }
-    KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
-};
-
-template <>
-struct special_scalar_constants<double>
-{
-    constexpr static double highbitmask() { return -0.; }
-    KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
-    constexpr static double allzeros() { return 0.; }
-    KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
-};
-
-template <typename T>
-struct special_constants : public special_scalar_constants<subtype<T>>
-{
-public:
-    using Tsub = subtype<T>;
-};
-
 namespace intrinsics
 {
 
@@ -142,6 +73,12 @@ struct alignas(alignment<T, N>()) simd_array
 };
 
 template <typename T, size_t N>
+struct alignas(alignment<T, N>()) simd_array<bit<T>, N>
+{
+    bit_value<T> val[next_poweroftwo(N)];
+};
+
+template <typename T, size_t N>
 struct simd_type;
 
 template <typename T>
diff --git a/include/kfr/simd/impl/specialconstants.hpp b/include/kfr/simd/impl/specialconstants.hpp
@@ -0,0 +1,101 @@
+/*
+  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+  This file is part of KFR
+
+  KFR is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  KFR is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with KFR.
+
+  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+  Buying a commercial license is mandatory as soon as you develop commercial activities without
+  disclosing the source code of your own applications.
+  See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../cometa/numeric.hpp"
+#include "intrinsics.h"
+
+namespace kfr
+{
+using namespace cometa;
+
+#if defined CMT_COMPILER_GNU
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
+#elif defined CMT_COMPILER_MSVC
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
+#else
+inline f32 allones_f32() CMT_NOEXCEPT
+{
+    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
+}
+inline f64 allones_f64() CMT_NOEXCEPT
+{
+    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
+}
+inline f32 invhighbit_f32() CMT_NOEXCEPT
+{
+    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
+}
+inline f64 invhighbit_f64() CMT_NOEXCEPT
+{
+    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
+}
+#endif
+
+template <typename T>
+struct special_scalar_constants
+{
+    constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
+    constexpr static T allones() { return static_cast<T>(-1ll); }
+    constexpr static T allzeros() { return T(0); }
+    constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
+};
+
+#ifndef CMT_COMPILER_INTEL
+#define KFR_CONSTEXPR_NON_INTEL constexpr
+#else
+#define KFR_CONSTEXPR_NON_INTEL
+#endif
+
+template <>
+struct special_scalar_constants<float>
+{
+    constexpr static float highbitmask() { return -0.f; }
+    KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
+    constexpr static float allzeros() { return 0.f; }
+    KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
+};
+
+template <>
+struct special_scalar_constants<double>
+{
+    constexpr static double highbitmask() { return -0.; }
+    KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
+    constexpr static double allzeros() { return 0.; }
+    KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
+};
+
+template <typename T>
+struct special_constants : public special_scalar_constants<subtype<T>>
+{
+public:
+    using Tsub = subtype<T>;
+};
+
+} // namespace kfr
diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp
@@ -39,60 +39,6 @@ using maskfor = typename T::mask_t;
 namespace internal
 {
 
-template <typename T>
-constexpr inline T maskbits(bool value)
-{
-    return value ? special_constants<T>::allones() : special_constants<T>::allzeros();
-}
-} // namespace internal
-
-template <typename T, size_t N>
-struct mask : protected vec<T, N>
-{
-    using base = vec<T, N>;
-
-    KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default;
-
-    KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default;
-
-    KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default;
-
-    using simd_type = typename base::simd_type;
-
-    KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {}
-
-    template <typename... Args>
-    KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args)
-        : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2),
-               internal::maskbits<T>(static_cast<bool>(args))...)
-    {
-    }
-
-    using vec<T, N>::v;
-
-    KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT;
-
-    KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {}
-
-    template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
-    KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
-    {
-    }
-
-    template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))>
-    KFR_MEM_INTRINSIC mask(const mask<U, N>& m)
-        : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec()))))
-    {
-    }
-
-    KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT;
-
-    KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); }
-};
-
-namespace internal
-{
-
 template <typename T, size_t Nout, size_t N1, size_t... indices>
 constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>)
 {
@@ -106,50 +52,11 @@ constexpr vec<T, Nout> partial_mask()
 }
 } // namespace internal
 
-template <typename T, size_t N>
-KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT
-{
-    return ibitcast(base::operator[](index)) < 0;
-}
-
 template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)>
-constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... args)
+constexpr KFR_INTRINSIC vec<bit<T>, Nout> make_mask(bool arg, Args... args)
 {
-    return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...);
+    return vec<bit<T>, Nout>(arg, static_cast<bool>(args)...);
 }
 
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
-
-namespace cometa
-{
-
-template <typename T, size_t N>
-struct compound_type_traits<kfr::mask<T, N>>
-{
-    using subtype                      = T;
-    using deep_subtype                 = cometa::deep_subtype<T>;
-    constexpr static size_t width      = N;
-    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
-    constexpr static bool is_scalar    = false;
-    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
-    template <typename U>
-    using rebind = kfr::mask<U, N>;
-    template <typename U>
-    using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
-
-    KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index)
-    {
-        return value[index];
-    }
-};
-} // namespace cometa
-
-namespace std
-{
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
-{
-    using type = kfr::mask<typename common_type<T1, T2>::type, N>;
-};
-} // namespace std
diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp
@@ -800,11 +800,13 @@ vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<
 
 KFR_FN(packtranspose)
 
+#if 0
 template <typename T, size_t N>
-KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT
+KFR_I_CE vec<bit<T>, N>::vec(const base& v) CMT_NOEXCEPT
 {
     this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v;
 }
+#endif
 
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp
@@ -161,6 +161,8 @@ struct platform<cpu_t::common>
     constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
 
     constexpr static bool fast_unaligned = false;
+
+    constexpr static bool mask_registers = false;
 };
 template <>
 struct platform<cpu_t::sse2> : platform<cpu_t::common>
@@ -208,6 +210,8 @@ struct platform<cpu_t::avx512> : platform<cpu_t::avx2>
     constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
 
     constexpr static size_t simd_register_count = bitness_const(8, 32);
+
+    constexpr static bool mask_registers = true;
 };
 #endif
 #ifdef CMT_ARCH_ARM
@@ -234,6 +238,8 @@ struct platform<cpu_t::common>
     constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
 
     constexpr static bool fast_unaligned = false;
+
+    constexpr static bool mask_registers = false;
 };
 template <>
 struct platform<cpu_t::neon> : platform<cpu_t::common>
@@ -279,8 +285,9 @@ constexpr static bool is_simd_size(size_t size)
 
 template <typename T, size_t N = vector_width<T>>
 struct vec;
+
 template <typename T, size_t N = vector_width<T>>
-struct mask;
+using mask = vec<bit<T>, N>;
 
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp
@@ -28,6 +28,7 @@
 #include "../kfr.h"
 
 #include "impl/intrinsics.h"
+#include "impl/specialconstants.hpp"
 
 #include <climits>
 
@@ -40,8 +41,8 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers")
 
 #ifdef KFR_TESTING
-#include "../testo/testo.hpp"
 #include "../cometa/function.hpp"
+#include "../testo/testo.hpp"
 #endif
 
 #include "../cometa.hpp"
@@ -161,7 +162,8 @@ constexpr size_t max_test_size = 32;
 template <template <typename, size_t> class vec_tpl, typename T,
           typename sizes =
 #ifdef KFR_EXTENDED_TESTS
-              cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)>
+              cfilter_t<decltype(test_vector_sizes),
+                        decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)>
 #else
               csizes_t<1, 2>
 #endif
@@ -255,9 +257,88 @@ struct bitmask
 };
 
 template <typename T>
-struct maskbit
+constexpr inline T maskbits(bool value)
+{
+    return value ? special_constants<T>::allones() : special_constants<T>::allzeros();
+}
+
+template <typename T>
+struct bit_value;
+
+template <typename T>
+struct bit
+{
+    alignas(T) bool value;
+    bit() CMT_NOEXCEPT = default;
+
+    constexpr bit(const bit_value<T>& value) CMT_NOEXCEPT : value(static_cast<bool>(value)) {}
+
+    constexpr bit(T value) CMT_NOEXCEPT : value(bitcast_anything<itype<T>>(value) < 0) {}
+    constexpr bit(bool value) CMT_NOEXCEPT : value(value) {}
+
+    template <typename U>
+    constexpr bit(const bit<U>& value) CMT_NOEXCEPT : value(value.value)
+    {
+    }
+
+    constexpr operator bool() const CMT_NOEXCEPT { return value; }
+    constexpr explicit operator T() const CMT_NOEXCEPT { return maskbits<T>(value); }
+};
+
+template <typename T>
+struct bit_value
+{
+    T value;
+    bit_value() CMT_NOEXCEPT = default;
+
+    constexpr bit_value(const bit<T>& value) CMT_NOEXCEPT : bit_value(value.value) {}
+
+    constexpr bit_value(T value) CMT_NOEXCEPT : value(value) {}
+    constexpr bit_value(bool value) CMT_NOEXCEPT : value(maskbits<T>(value)) {}
+
+    template <typename U>
+    constexpr bit_value(const bit_value<U>& value) CMT_NOEXCEPT : bit_value(value.operator bool())
+    {
+    }
+
+    constexpr operator bool() const CMT_NOEXCEPT { return bitcast_anything<itype<T>>(value) < 0; }
+    constexpr explicit operator T() const CMT_NOEXCEPT { return value; }
+};
+
+template <typename T>
+struct special_scalar_constants<bit<T>>
+{
+    constexpr static bit<T> highbitmask() { return true; }
+    constexpr static bit<T> allones() noexcept { return true; };
+    constexpr static bit<T> allzeros() { return false; }
+    constexpr static bit<T> invhighbitmask() { return false; }
+};
+
+namespace internal_generic
+{
+template <typename T>
+struct unwrap_bit
+{
+    using type = T;
+};
+template <typename T>
+struct unwrap_bit<bit<T>>
+{
+    using type = T;
+};
+
+} // namespace internal_generic
+
+template <typename T>
+using unwrap_bit = typename internal_generic::unwrap_bit<T>::type;
+
+template <typename T>
+struct is_bit : cfalse_t
+{
+};
+template <typename T>
+struct is_bit<bit<T>> : ctrue_t
 {
-    bool value;
 };
 
 namespace fn_generic
@@ -340,6 +421,11 @@ struct is_simd_type
 {
 };
 
+template <typename T>
+struct is_simd_type<bit<T>> : is_simd_type<T>
+{
+};
+
 template <typename T, size_t N>
 struct vec_shape
 {
diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp
@@ -112,9 +112,6 @@ template <typename T, size_t N>
 struct vec;
 
 template <typename T, size_t N>
-struct mask;
-
-template <typename T, size_t N>
 struct vec_halves
 {
     vec<T, prev_poweroftwo(N - 1)> low;
@@ -250,15 +247,15 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
     }
 
     // from vector of another type
-    template <typename U,
-              KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)>
+    template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value &&
+                                        (compound_type_traits<T>::is_scalar && !is_bit<U>::value))>
     KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
         : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v))
     {
     }
 
-    template <typename U,
-              KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
+    template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value &&
+                                        !(compound_type_traits<T>::is_scalar && !is_bit<U>::value))>
     KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
         : v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v)
     {
@@ -412,6 +409,11 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
 
     KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); }
 
+    KFR_MEM_INTRINSIC constexpr vec<unwrap_bit<T>, N> asvec() const CMT_NOEXCEPT
+    {
+        return vec<unwrap_bit<T>, N>(v);
+    }
+
     constexpr static size_t simd_element_size  = const_min(vector_width<T>, N);
     constexpr static size_t simd_element_count = N / simd_element_size;
     using simd_element_type                    = simd<ST, simd_element_size>;
@@ -1152,7 +1154,18 @@ void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isap
 
 namespace internal
 {
-// vector<vector> to vector<vector>
+// vector to vector<vector>
+template <typename To, typename From, size_t N>
+struct conversion<vec<bit<To>, N>, vec<bit<From>, N>>
+{
+    static vec<bit<To>, N> cast(const vec<bit<From>, N>& value)
+    {
+        return vec<To, N>::frombits(innercast<itype<To>>(vec<itype<From>, N>::frombits(value.asvec())))
+            .asmask();
+    }
+};
+
+// vector to vector<vector>
 template <typename To, typename From, size_t N1, size_t N2, size_t Ns1>
 struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>>
 {
diff --git a/sources.cmake b/sources.cmake
@@ -136,6 +136,7 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i
     ${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -33,7 +33,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/")
 if (ENABLE_ASMTEST)
     add_executable(asm_test asm_test.cpp)
     target_link_libraries(asm_test kfr)
-    target_set_arch(asm_test PRIVATE avx2)
+    target_set_arch(asm_test PRIVATE sse2)
     target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED)
     if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         target_compile_options(asm_test PRIVATE -fno-stack-protector)
@@ -153,9 +153,7 @@ if(USE_SDE)
     find_program(EMULATOR "sde")
     list(APPEND EMULATOR "-skx")
     list(APPEND EMULATOR "--")
-elseif (ARM)
-    find_program(EMULATOR "qemu-arm")
-else ()
+elseif (NOT EMULATOR)
     set(EMULATOR "")
 endif ()
 
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -133,6 +133,13 @@ TEST(mix)
     });
 }
 
+TEST(expression_mask)
+{
+    univector<float> x(100);
+    univector<float> y(100);
+    x = select(x > y, 0.5f, 0.1f) * (y - x) + x;
+}
+
 constexpr inline size_t fast_range_sum(size_t stop) { return stop * (stop + 1) / 2; }
 
 TEST(partition)
diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp
@@ -142,7 +142,7 @@ TEST(eq)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x == y);
+                       return maskbits<subtype<decltype(x)>>(x == y);
                    });
 }
 
@@ -150,7 +150,7 @@ TEST(ne)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x != y);
+                       return maskbits<subtype<decltype(x)>>(x != y);
                    });
 }
 
@@ -158,7 +158,7 @@ TEST(ge)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x >= y);
+                       return maskbits<subtype<decltype(x)>>(x >= y);
                    });
 }
 
@@ -166,7 +166,7 @@ TEST(le)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x <= y);
+                       return maskbits<subtype<decltype(x)>>(x <= y);
                    });
 }
 
@@ -174,7 +174,7 @@ TEST(gt)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x > y);
+                       return maskbits<subtype<decltype(x)>>(x > y);
                    });
 }
 
@@ -182,7 +182,7 @@ TEST(lt)
 {
     test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); },
                    [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
-                       return internal::maskbits<subtype<decltype(x)>>(x < y);
+                       return maskbits<subtype<decltype(x)>>(x < y);
                    });
 }
 
diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp
@@ -124,11 +124,31 @@ TEST(unaligned_read)
 
         for (size_t i = 0; i < N; i++)
         {
-//            testo::scope sc(as_string("i = ", i));
+            testo::scope sc(as_string("i = ", i));
             CHECK(read<N, false>(data + i) == (enumerate<Tsub, N>() + static_cast<Tsub>(i)));
         }
     });
 }
 
+TEST(mask_broadcast)
+{
+    CHECK(mask<i32, 4>(mask<f32, 4>(true, false, true, false)).asvec() == vec<i32, 4>(-1, 0, -1, 0));
+    CHECK(mask<i32, 4>(mask<f32, 4>(true)).asvec() == vec<i32, 4>(-1, -1, -1, -1));
+}
+
+TEST(masks)
+{
+    mask<float, 4> m = make_mask<float>(false, true, false, true);
+    vec<float, 4> v  = m.asvec();
+    CHECK(bit<float>(m[0]) == false);
+    CHECK(bit<float>(m[1]) == true);
+    CHECK(bit<float>(m[2]) == false);
+    CHECK(bit<float>(m[3]) == true);
+    CHECK(float(v[0]) == maskbits<float>(false));
+    CHECK(float(v[1]) == maskbits<float>(true));
+    CHECK(float(v[2]) == maskbits<float>(false));
+    CHECK(float(v[3]) == maskbits<float>(true));
+}
+
 } // namespace CMT_ARCH_NAME
 } // namespace kfr

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

A	cmake/aarch64.cmake	\|	37	+++++++++++++++++++++++++++++++++++++
M	cmake/arm.cmake	\|	11	++++++++---
M	include/kfr/cident.h	\|	2	+-
M	include/kfr/cometa/numeric.hpp	\|	2	+-
M	include/kfr/dsp/window.hpp	\|	2	+-
M	include/kfr/math/impl/logical.hpp	\|	250	++++++++++++++++++++++++++++++++++++++++---------------------------------------
M	include/kfr/math/impl/select.hpp	\|	125	+++++++++++++++++++++++++++++++++++++++----------------------------------------
M	include/kfr/math/logical.hpp	\|	4	++--
M	include/kfr/math/select.hpp	\|	2	+-
M	include/kfr/simd/impl/backend_clang.hpp	\|	8	++++----
M	include/kfr/simd/impl/backend_generic.hpp	\|	27	+++++++++++++++------------
M	include/kfr/simd/impl/basicoperators_generic.hpp	\|	36	++++++++++++++++++------------------
M	include/kfr/simd/impl/function.hpp	\|	44	++++++++++++++++++++++++++++++++++++++++++++
M	include/kfr/simd/impl/simd.hpp	\|	75	++++++---------------------------------------------------------------------
A	include/kfr/simd/impl/specialconstants.hpp	\|	101	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	include/kfr/simd/mask.hpp	\|	97	++-----------------------------------------------------------------------------
M	include/kfr/simd/operators.hpp	\|	4	+++-
M	include/kfr/simd/platform.hpp	\|	9	++++++++-
M	include/kfr/simd/types.hpp	\|	94	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	include/kfr/simd/vec.hpp	\|	29	+++++++++++++++++++++--------
M	sources.cmake	\|	1	+
M	tests/CMakeLists.txt	\|	6	++----
M	tests/expression_test.cpp	\|	7	+++++++
M	tests/unit/simd/operators.cpp	\|	12	++++++------
M	tests/unit/simd/vec.cpp	\|	22	+++++++++++++++++++++-