kfr

Fast, modern C++ DSP framework: FFT, sample rate conversion, FIR/IIR/biquad filters (SSE, AVX, AVX-512, ARM NEON)

commit 940f5d9c26ef2a724a5faf4f421470ca6ef72564
parent 51d4e60bd55b1a68556ac5ca2d443c970769adb1
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Wed, 12 Oct 2022 14:40:12 +0100

Refactoring

Diffstat:
M include/kfr/base.hpp | 2 --
M include/kfr/base/basic_expressions.hpp | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M include/kfr/base/conversion.hpp | 2 +-
M include/kfr/base/expression.hpp | 25 +++++++++++++++++++++++--
M include/kfr/base/generators.hpp | 226 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M include/kfr/base/pointer.hpp | 204 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M include/kfr/base/random.hpp | 80 +------------------------------------------------------------------------------
A include/kfr/base/random_bits.hpp | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/base/reduce.hpp | 2 +-
M include/kfr/base/shape.hpp | 36 +++++++++++++++++++++++++++---------
D include/kfr/base/sort.hpp | 103 -------------------------------------------------------------------------------
M include/kfr/base/tensor.hpp | 6 +++---
M include/kfr/math.hpp | 7 -------
M include/kfr/math/complex_math.hpp | 6 +++---
D include/kfr/math/impl/abs.hpp | 138 -------------------------------------------------------------------------------
M include/kfr/math/impl/asin_acos.hpp | 6 +++---
M include/kfr/math/impl/atan.hpp | 6 +++---
D include/kfr/math/impl/clamp.hpp | 55 -------------------------------------------------------
M include/kfr/math/impl/hyperbolic.hpp | 8 ++++----
M include/kfr/math/impl/log_exp.hpp | 10 +++++-----
D include/kfr/math/impl/logical.hpp | 284 -------------------------------------------------------------------------------
D include/kfr/math/impl/min_max.hpp | 236 -------------------------------------------------------------------------------
D include/kfr/math/impl/round.hpp | 282 -------------------------------------------------------------------------------
D include/kfr/math/impl/saturation.hpp | 205 -------------------------------------------------------------------------------
D include/kfr/math/impl/select.hpp | 331 -------------------------------------------------------------------------------
M include/kfr/math/impl/sin_cos.hpp | 8 ++++----
M include/kfr/math/impl/tan.hpp | 6 +++---
M include/kfr/math/interpolation.hpp | 2 +-
M include/kfr/simd.hpp | 8 ++++++++
R include/kfr/math/abs.hpp -> include/kfr/simd/abs.hpp | 0
R include/kfr/math/clamp.hpp -> include/kfr/simd/clamp.hpp | 0
A include/kfr/simd/impl/abs.hpp | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/clamp.hpp | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/logical.hpp | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/min_max.hpp | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/round.hpp | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/saturation.hpp | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/select.hpp | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R include/kfr/math/logical.hpp -> include/kfr/simd/logical.hpp | 0
R include/kfr/math/min_max.hpp -> include/kfr/simd/min_max.hpp | 0
R include/kfr/math/round.hpp -> include/kfr/simd/round.hpp | 0
R include/kfr/math/saturation.hpp -> include/kfr/simd/saturation.hpp | 0
R include/kfr/math/select.hpp -> include/kfr/simd/select.hpp | 0
A include/kfr/simd/sort.hpp | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M sources.cmake | 40 +++++++++++++++++++++-------------------
R tests/unit/math/abs.cpp -> tests/unit/simd/abs.cpp | 0
R tests/unit/math/min_max.cpp -> tests/unit/simd/min_max.cpp | 0
R tests/unit/math/round.cpp -> tests/unit/simd/round.cpp | 0
R tests/unit/math/select.cpp -> tests/unit/simd/select.cpp | 0
49 files changed, 2199 insertions(+), 1994 deletions(-)

diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp
@@ -29,7 +29,6 @@
 #include "base/expression.hpp"
 #include "base/filter.hpp"
 #include "base/fraction.hpp"
-#include "base/function_expressions.hpp"
 #include "base/generators.hpp"
 #include "base/math_expressions.hpp"
 #include "base/memory.hpp"
@@ -38,5 +37,4 @@
 #include "base/reduce.hpp"
 #include "base/simd_expressions.hpp"
 #include "base/small_buffer.hpp"
-#include "base/sort.hpp"
 #include "base/univector.hpp"
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -93,8 +93,8 @@ struct expression_traits<xcounter<T, Dims>> : expression_traits_defaults
     using value_type = T;
     constexpr static size_t dims = Dims;
-    constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(max_index_t); }
-    constexpr static shape<dims> shapeof() { return shape<dims>(max_index_t); }
+    constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(infinite_size); }
+    constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
 };
 
 template <typename T, typename... Args, typename Tout = std::common_type_t<T, Args...>>
@@ -151,9 +151,9 @@ struct expression_traits<xslice<Arg>> : expression_traits_defaults
 
     KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xslice<Arg>& self)
     {
-        return min(ArgTraits::shapeof(self.first()).sub_inf(self.start), self.size);
+        return min(sub_shape(ArgTraits::shapeof(self.first()), self.start), self.size);
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(0); }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
 };
 
 template <typename Arg, KFR_ACCEPT_EXPRESSIONS(Arg), index_t Dims = expression_dims<Arg>>
@@ -333,8 +333,11 @@ struct expression_traits<xpadded<Arg>> : expression_traits_defaults
     constexpr static size_t dims = ArgTraits::dims;
     constexpr static bool random_access = ArgTraits::random_access;
 
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self) { return infinite_size; }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return infinite_size; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self)
+    {
+        return shape<dims>(infinite_size);
+    }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
 };
 
 inline namespace CMT_ARCH_NAME
@@ -516,7 +519,7 @@ struct expression_traits<xreshape<Arg, OutDims>> : expression_traits_defaults
     {
         return self.out_shape;
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
 };
 
 inline namespace CMT_ARCH_NAME
@@ -647,7 +650,7 @@ struct expression_traits<xlinspace<T, truncated>> : expression_traits_defaults
     {
         return shape<dims>(truncated ? self.size : infinite_size);
     }
-    constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? 0 : infinite_size); }
+    constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? undefined_size : infinite_size); }
 };
 
 template <bool truncated = false, typename T1, typename T2, typename Tout = std::common_type_t<T1, T2>>
@@ -672,4 +675,106 @@ KFR_INTRINSIC vec<T, N> get_elements(const xlinspace<T, truncated>& self, const
 
 // ----------------------------------------------------------------------------
 
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct xconcatenate : public xwitharguments<Arg1, Arg2>
+{
+    static_assert(expression_dims<Arg1> == expression_dims<Arg2>);
+    static_assert(std::is_same_v<expression_value_type<Arg1>, expression_value_type<Arg2>>);
+    constexpr static index_t dims = expression_dims<Arg1>;
+    shape<dims> size1;
+
+    KFR_MEM_INTRINSIC xconcatenate(Arg1&& arg1, Arg2&& arg2)
+        : xwitharguments<Arg1, Arg2>{ std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) },
+          size1(expression_traits<Arg1>::shapeof(arg1))
+    {
+    }
+};
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>> : expression_traits_defaults
+{
+    using ArgTraits1 = expression_traits<Arg1>;
+    using ArgTraits2 = expression_traits<Arg2>;
+
+    using value_type = typename ArgTraits1::value_type;
+    constexpr static size_t dims = ArgTraits1::dims;
+    constexpr static bool random_access = ArgTraits1::random_access && ArgTraits2::random_access;
+
+    KFR_INTRINSIC static shape<dims> concat_shape(const shape<dims>& sh1, const shape<dims>& sh2)
+    {
+        shape<dims> result = min(sh1, sh2);
+        shape<dims> sum = add_shape_undef(sh1, sh2);
+        result[ConcatAxis] = sum[ConcatAxis];
+        return result;
+    }
+
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xconcatenate<Arg1, Arg2, ConcatAxis>& self)
+    {
+        return concat_shape(ArgTraits1::shapeof(std::get<0>(self)), ArgTraits2::shapeof(std::get<1>(self)));
+    }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof()
+    {
+        return concat_shape(ArgTraits1::shapeof(), ArgTraits2::shapeof());
+    }
+};
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2)>
+KFR_INTRINSIC xconcatenate<Arg1, Arg2, ConcatAxis> concatenate(Arg1&& arg1, Arg2&& arg2)
+{
+    return { std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) };
+}
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, typename Arg3,
+          KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2, Arg3)>
+KFR_INTRINSIC xconcatenate<Arg1, xconcatenate<Arg2, Arg3, ConcatAxis>, ConcatAxis> concatenate(Arg1&& arg1,
+                                                                                               Arg2&& arg2,
+                                                                                               Arg3&& arg3)
+{
+    return { std::forward<Arg1>(arg1), { std::forward<Arg2>(arg2), std::forward<Arg3>(arg3) } };
+}
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis, index_t NDims, index_t Axis, size_t N,
+          typename T = typename expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>>::value_type>
+KFR_INTRINSIC vec<T, N> get_elements(const xconcatenate<Arg1, Arg2, ConcatAxis>& self,
+                                     const shape<NDims>& index, const axis_params<Axis, N>& sh)
+{
+    const index_t size1 = self.size1;
+    constexpr index_t Naxis = ConcatAxis == Axis ? N : 1;
+    if (index[ConcatAxis] >= size1[ConcatAxis])
+    {
+        shape index1 = index;
+        index1[ConcatAxis] -= size1[ConcatAxis];
+        return get_elements(std::get<1>(self.args), index1, sh);
+    }
+    else if (CMT_LIKELY(index[ConcatAxis] + Naxis <= size1[ConcatAxis]))
+    {
+        return get_elements(std::get<0>(self.args), index, sh);
+    }
+    else // (index < size1) && (index + N > size1)
+    {
+        vec<T, N> result;
+        // Here Axis == ConcatAxis
+        shape index1 = index;
+        for (index_t i = 0; i < size1[ConcatAxis] - index[ConcatAxis]; ++i)
+        {
+            result[i] = get_elements(std::get<0>(self.args), index1, axis_params<Axis, 1>{})[0];
+            ++index1[ConcatAxis];
+        }
+        index1[ConcatAxis] -= size1[ConcatAxis];
+        for (index_t i = size1[ConcatAxis] - index[ConcatAxis]; i < N; ++i)
+        {
+            result[i] = get_elements(std::get<1>(self.args), index1, axis_params<Axis, 1>{})[0];
+            ++index1[ConcatAxis];
+        }
+        return result;
+    }
+}
+
+} // namespace CMT_ARCH_NAME
+
+// ----------------------------------------------------------------------------
+
 } // namespace kfr
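Usage sketch for the new concatenate() expression (not part of this commit; assumes KFR's univector container and the truncate() helper from the rest of the library):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    const univector<float, 3> a{ 1, 2, 3 };
    const univector<float, 2> b{ 4, 5 };
    // join along axis 0: concat_shape gives 3 + 2 = 5 elements
    const univector<float, 5> c = truncate(concatenate(a, b), 5);
    // c == { 1, 2, 3, 4, 5 }; reads that straddle the seam are stitched element-by-element
    return c[3] == 4 ? 0 : 1;
}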
diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp
@@ -25,7 +25,7 @@
 */
 #pragma once
 
-#include "../math/clamp.hpp"
+#include "../simd/clamp.hpp"
 #include "../simd/types.hpp"
 #include "../simd/vec.hpp"
 #include "univector.hpp"
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -131,7 +131,7 @@ using enable_if_output_expression =
     vec<typename expression_traits<T>::value_type, 1>{}))>;
 
 template <typename T>
-using enable_if_inout_output_expression =
+using enable_if_input_output_expression =
     std::void_t<enable_if_input_expression<T>, enable_if_output_expression<T>>;
 
 template <typename... T>
@@ -140,6 +140,27 @@ using enable_if_input_expressions = std::void_t<enable_if_input_expression<T>...
 template <typename... T>
 using enable_if_output_expressions = std::void_t<enable_if_output_expression<T>...>;
 
+template <typename... T>
+using enable_if_input_output_expressions = std::void_t<enable_if_input_output_expression<T>...>;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_expression<E, enable_if_input_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_output_expression<E, enable_if_output_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_output_expression<E, enable_if_input_output_expression<E>> = true;
+
 #define KFR_ACCEPT_EXPRESSIONS(...) internal_generic::expressions_check<__VA_ARGS__>* = nullptr
 
 template <typename T>
@@ -311,7 +332,7 @@ static auto process(Out&& out, In&& in, shape<outdims> start = shape<outdims>(0)
     const shape<indims> inshape = Trin::shapeof(in);
     if (CMT_UNLIKELY(!internal_generic::can_assign_from(outshape, inshape)))
         return shape<outdims>{ 0 };
-    shape<outdims> stop = min(start.add_inf(size), outshape);
+    shape<outdims> stop = min(add_shape(start, size), outshape);
 
     index_t in_size = 0;
     if constexpr (indims > 0)
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -26,107 +26,108 @@
 #pragma once
 
 #include "../math/log_exp.hpp"
-#include "../math/select.hpp"
 #include "../math/sin_cos.hpp"
 #include "../simd/impl/function.hpp"
+#include "../simd/select.hpp"
 #include "../simd/vec.hpp"
+#include "shape.hpp"
 
 namespace kfr
 {
-inline namespace CMT_ARCH_NAME
-{
-
-namespace internal
-{
-template <typename T, size_t width_, typename Class>
-struct generator : input_expression
+template <typename T, size_t VecWidth, typename Class, typename Twork = T>
+struct xgenerator
 {
-    constexpr static size_t width = width_;
-    using value_type = T;
-
-    constexpr static bool is_incremental = true;
-
-    template <typename U, size_t N>
-    friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t index,
-                                                vec_shape<U, N> t)
-    {
-        return self.generate(t);
-    }
+    constexpr static size_t width = VecWidth;
 
     void resync(T start) const { ptr_cast<Class>(this)->sync(start); }
 
-protected:
-    void call_next() const { ptr_cast<Class>(this)->next(); }
     template <size_t N>
-    void call_shift(csize_t<N>) const
+    KFR_MEM_INTRINSIC vec<T, N> generate() const
     {
-        ptr_cast<Class>(this)->shift(csize_t<N>());
+        if constexpr (N < width)
+        {
+            const vec<T, N> result = narrow<N>(call_get_value());
+            const vec<Twork, width> oldvalue = value;
+            call_next();
+            value = slice<N, width>(oldvalue, value);
+            return result;
+        }
+        else if (N > width)
+        {
+            const vec lo = generate(low(x));
+            const vec hi = generate(high(x));
+            return concat(lo, hi);
+        }
+        else // N == width
+        {
+            const vec<T, N> result = call_get_value();
+            call_next();
+            return result;
+        }
     }
+    mutable vec<Twork, width> value;
 
-    template <size_t N>
-    void shift(csize_t<N>) const
-    {
-        const vec<T, width> oldvalue = value;
-        call_next();
-        value = slice<N, width>(oldvalue, value);
-    }
+private:
+    KFR_MEM_INTRINSIC void call_next() const { ptr_cast<Class>(this)->next(); }
 
-    template <size_t N, KFR_ENABLE_IF(N == width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
-    {
-        const vec<T, N> result = value;
-        call_next();
-        return result;
-    }
+    KFR_MEM_INTRINSIC vec<T, width> call_get_value() const { return ptr_cast<Class>(this)->get_value(); }
 
-    template <size_t N, KFR_ENABLE_IF(N < width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
+    KFR_MEM_INTRINSIC std::enable_if_t<std::is_same_v<T, Twork>, vec<T, width>> get_value() const
     {
-        const vec<T, N> result = narrow<N>(value);
-        call_shift(csize_t<N>());
-        return result;
+        return value;
     }
+};
 
-    template <size_t N, KFR_ENABLE_IF(N > width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const
-    {
-        const auto lo = generate(low(x));
-        const auto hi = generate(high(x));
-        return concat(lo, hi);
-    }
+template <typename T, size_t VecWidth, typename Class>
+struct expression_traits<xgenerator<T, VecWidth, Class>> : public expression_traits_defaults
+{
+    using value_type = T;
+    constexpr static size_t dims = 1;
+    constexpr static shape<1> shapeof(const T&) { return shape<1>(infinite_size); }
+    constexpr static shape<1> shapeof() { return shape<1>(infinite_size); }
 
-    mutable vec<T, width> value;
+    constexpr static inline bool explicit_operand = true;
+    constexpr static inline bool random_access = false;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_linear : generator<T, width, generator_linear<T, width>>
+inline namespace CMT_ARCH_NAME
+{
+template <typename T, size_t VecWidth, typename Class, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xgenerator<T, VecWidth, Class>& self, const shape<1>& index,
+                                     const axis_params<0, N>&)
+{
+    return self.template generate<N>();
+}
+} // namespace CMT_ARCH_NAME
+
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenlinear : public xgenerator<T, VecWidth, xgenlinear<T, VecWidth>>
 {
-    generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); }
+    xgenlinear(T start, T step) CMT_NOEXCEPT : step{ step }, vstep{ step * VecWidth } { sync(start); }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        this->value = start + enumerate<T, width>() * step;
+        this->value = start + enumerate(vec_shape<T, VecWidth>{}, vstep / VecWidth);
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; }
 
-protected:
-    T step;
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp : generator<T, width, generator_exp<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp : public xgenerator<T, VecWidth, xgenexp<T, VecWidth>>
 {
-    generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1)
+    xgenexp(T start, T step) CMT_NOEXCEPT : step{ step },
+                                            vstep{ exp(make_vector(step * VecWidth)).front() - 1 }
     {
        this->resync(start);
    }
 
    KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
    {
-        this->value = exp(start + enumerate<T, width>() * step);
+        this->value = exp(start + enumerate<T, VecWidth>() * step);
    }
 
    KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
 
@@ -136,14 +137,14 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2),
-          std::enable_if_t<std::is_same<complex<deep_subtype<T>>, T>::value, int> = 0>
-struct generator_expj : generator<T, width, generator_expj<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<deep_subtype<T>> / 8 / 2>
+struct xgenexpj : public xgenerator<T, VecWidth, xgenexpj<T, VecWidth>>
 {
     using ST = deep_subtype<T>;
+    static_assert(std::is_same_v<complex<deep_subtype<T>>, T>, "xgenexpj requires complex type");
 
-    generator_expj(ST start_, ST step_)
-        : step(step_), alpha(2 * sqr(sin(width * step / 2))), beta(-sin(width * step))
+    xgenexpj(ST start_, ST step_)
+        : step(step_), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(-sin(VecWidth * step))
     {
         this->resync(T(start_));
     }
@@ -156,26 +157,26 @@ struct generator_expj : generator<T, width, generator_expj<T, width>>
     }
 
 protected:
-    ST const step;
-    ST const alpha;
-    ST const beta;
-    CMT_NOINLINE static vec<T, width> init_cossin(ST w, ST phase)
+    ST step;
+    ST alpha;
+    ST beta;
+    CMT_NOINLINE static vec<T, VecWidth> init_cossin(ST w, ST phase)
     {
         return ccomp(cossin(dup(phase + enumerate<ST, width>() * w)));
     }
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp2 : public xgenerator<T, VecWidth, xgenexp2<T, VecWidth>>
 {
-    generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
+    xgenexp2(T start, T step) CMT_NOEXCEPT : step{ step }, vstep{ exp2(make_vector(step * VecWidth))[0] - 1 }
     {
         this->resync(start);
     }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        this->value = exp2(start + enumerate<T, width>() * step);
+        this->value = exp2(start + enumerate(vec_shape<T, VecWidth>, step));
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
 
@@ -185,11 +186,12 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_cossin : generator<T, width, generator_cossin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgencossin : public xgenerator<T, VecWidth, xgencossin<T, VecWidth>>
 {
-    generator_cossin(T start, T step)
-        : step(step), alpha(2 * sqr(sin(width / 2 * step / 2))), beta(-sin(width / 2 * step))
+    static_assert(VecWidth % 2 == 0);
+    xgencossin(T start, T step)
+        : step(step), alpha(2 * sqr(sin(VecWidth / 2 * step / 2))), beta(-sin(VecWidth / 2 * step))
     {
         this->resync(start);
     }
@@ -204,56 +206,50 @@ protected:
     T step;
     T alpha;
     T beta;
-    CMT_NOINLINE static vec<T, width> init_cossin(T w, T phase)
+    CMT_NOINLINE static vec<T, VecWidth> init_cossin(T w, T phase)
     {
-        return cossin(dup(phase + enumerate<T, width / 2>() * w));
+        return cossin(dup(phase + enumerate(vec_shape<T, VecWidth / 2>, w)));
     }
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(2, 4)>
-struct generator_sin : generator<T, width, generator_sin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8 / 2>
+struct xgensin : public xgenerator<T, VecWidth, xgensin<T, VecWidth>, vec<T, 2>>
 {
-    generator_sin(T start, T step)
-        : step(step), alpha(2 * sqr(sin(width * step / 2))), beta(sin(width * step))
+    xgensin(T start, T step)
+        : step(step), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(sin(VecWidth * step))
     {
         this->resync(start);
     }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step)));
-        this->cos_value = low(cs);
-        this->value = high(cs);
+        const vec<T, 2 * VecWidth> cs = cossin(dup(start + enumerate(vec_shape<T, VecWidth>, step)));
+        this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
     {
-        const vec<T, width> c = this->cos_value;
-        const vec<T, width> s = this->value;
+        const vec<T, 2 * VecWidth> cs = flatten(this->value);
 
-        const vec<T, width> cc = alpha * c + beta * s;
-        const vec<T, width> ss = alpha * s - beta * c;
+        cs = cs - addsub(alpha * cs, beta * swap<2>(cs));
 
-        this->cos_value = c - cc;
-        this->value = s - ss;
-    }
+        this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
 
-    template <size_t N>
-    void shift(csize_t<N>) const CMT_NOEXCEPT
-    {
-        const vec<T, width> oldvalue = this->value;
-        const vec<T, width> oldcosvalue = this->cos_value;
-        next();
-        this->value = slice<N, width>(oldvalue, this->value);
-        this->cos_value = slice<N, width>(oldcosvalue, this->cos_value);
+        // const vec<T, VecWidth> c = even(flatten(this->value));
+        // const vec<T, VecWidth> s = odd(flatten(this->value));
+        // const vec<T, VecWidth> cc = alpha * c + beta * s;
+        // const vec<T, VecWidth> ss = alpha * s - beta * c;
+        // this->cos_value = c - cc;
+        // this->value = s - ss;
     }
 
 protected:
     T step;
     T alpha;
     T beta;
-    mutable vec<T, width> cos_value;
 };
-} // namespace internal
+
+inline namespace CMT_ARCH_NAME
+{
 
 /**
 * @brief Returns template expression that generates values starting from the start and using the step as the
@@ -264,9 +260,9 @@ protected:
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
+KFR_FUNCTION xgenlinear<TF> gen_linear(T1 start, T2 step)
 {
-    return internal::generator_linear<TF>(start, step);
+    return xgenlinear<TF>(start, step);
 }
 
 /**
@@ -276,9 +272,9 @@ KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
+KFR_FUNCTION xgenexp<TF> gen_exp(T1 start, T2 step)
 {
-    return internal::generator_exp<TF>(start, step);
+    return xgenexp<TF>(start, step);
 }
 
 /**
@@ -288,9 +284,9 @@ KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = complex<ftype<common_type<T1, T2>>>>
-KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
+KFR_FUNCTION xgenexpj<TF> gen_expj(T1 start, T2 step)
 {
-    return internal::generator_expj<TF>(start, step);
+    return xgenexpj<TF>(start, step);
 }
 
 /**
@@ -300,9 +296,9 @@ KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
+KFR_FUNCTION xgenexp2<TF> gen_exp2(T1 start, T2 step)
 {
-    return internal::generator_exp2<TF>(start, step);
+    return xgenexp2<TF>(start, step);
 }
 
 /**
@@ -316,9 +312,9 @@ KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
+KFR_FUNCTION xgencossin<TF> gen_cossin(T1 start, T2 step)
 {
-    return internal::generator_cossin<TF>(start, step);
+    return xgencossin<TF>(start, step);
 }
 
 /**
@@ -328,9 +324,9 @@ KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step)
+KFR_FUNCTION xgensin<TF> gen_sin(T1 start, T2 step)
 {
-    return internal::generator_sin<TF>(start, step);
+    return xgensin<TF>(start, step);
 }
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
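Usage sketch for the renamed generators (not part of this commit; the public gen_linear/gen_sin names are unchanged, truncate() is assumed):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // linear ramp 0, 0.5, 1.0, ... produced by adding vstep once per SIMD block
    univector<double, 8> ramp = truncate(gen_linear(0.0, 0.5), 8);
    // sin(0), sin(0.1), ... via the alpha/beta rotation recurrence, no per-sample sin() call
    univector<double, 8> sine = truncate(gen_sin(0.0, 0.1), 8);
    return 0;
}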
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -38,6 +38,15 @@ struct expression_pointer;
 template <typename T>
 constexpr size_t maximum_expression_width = vector_width_for<T, cpu_t::highest> * 2;
 
+template <typename T, template <T...> typename Tpl, typename Pack>
+struct expand_cvals;
+
+template <typename T, template <T...> typename Tpl, T... vals>
+struct expand_cvals<T, Tpl, cvals_t<T, vals...>>
+{
+    using type = Tpl<vals...>;
+};
+
 inline namespace CMT_ARCH_NAME
 {
 
@@ -50,64 +59,57 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
 }
 } // namespace CMT_ARCH_NAME
 
-template <typename T, size_t N = maximum_expression_width<T>>
-struct expression_vtable : expression_vtable<T, N / 2>
+template <typename T, index_t Dims>
+struct expression_vtable
 {
-    using func_get = void (*)(void*, size_t, vec<T, N>&);
-    func_get get;
+    constexpr static const size_t Nsizes = 1 + ilog2(maximum_expression_width<T>);
 
-    template <typename Expression>
-    expression_vtable(ctype_t<Expression> t)
-        : expression_vtable<T, N / 2>(t), get(&expression_vtable<T, N>::template static_get<Expression>)
-    {
-    }
+    using func_get = void (*)(void*, shape<Dims>, T*);
+    using func_set = void (*)(void*, shape<Dims>, const T*);
+    using func_shapeof = void (*)(void*, shape<Dims>&);
+    using func_substitute = bool (*)(void*, expression_pointer<T>&&);
 
-    template <typename Expression>
-    static void static_get(void* instance, size_t index, vec<T, N>& result)
-    {
-        result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>());
-    }
-};
-
-template <typename T>
-struct expression_vtable<T, 0>
-{
-    using func_size = size_t (*)(void* p);
-    using func_begin_block = void (*)(void*, size_t);
-    using func_end_block = void (*)(void*, size_t);
-    using func_substitute = bool (*)(void*, expression_pointer<T>&&);
-
-    func_size size;
-    func_begin_block begin_block;
-    func_end_block end_block;
-    func_substitute substitute;
+    func_shapeof fn_shapeof;
+    func_substitute fn_substitute;
+    std::array<std::array<func_get, Nsizes>, Dims> fn_get_elements;
+    std::array<std::array<func_set, Nsizes>, Dims> fn_set_elements;
 
     template <typename Expression>
-    expression_vtable(ctype_t<Expression>)
-        : size(&expression_vtable<T, 0>::template static_size<Expression>),
-          begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>),
-          end_block(&expression_vtable<T, 0>::template static_end_block<Expression>),
-          substitute(&expression_vtable<T, 0>::template static_substitute<Expression>)
+    expression_vtable(ctype_t<Expression> t)
     {
+        fn_shapeof = &static_shapeof<Expression>;
+        fn_substitute = &static_substitute<Expression>;
+        cforeach(csizeseq<Nsizes>,
+                 [&](size_t size) CMT_INLINE_LAMBDA
+                 {
+                     cforeach(csizeseq<Dims>,
+                              [&](size_t axis) CMT_INLINE_LAMBDA
+                              {
+                                  fn_get_elements[axis][size] =
+                                      &expression_vtable::static_get_elements<Expression, 1 << size, axis>;
+                                  fn_set_elements[axis][size] =
+                                      &expression_vtable::static_set_elements<Expression, 1 << size, axis>;
+                              });
+                 });
     }
 
-    template <typename Expression>
-    static size_t static_size(void* instance)
+    template <typename Expression, size_t N, index_t VecAxis>
+    static void static_get_elements(void* instance, shape<Dims> index, T* dest)
     {
-        return static_cast<Expression*>(instance)->size();
+        write(dest, get_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>));
     }
 
-    template <typename Expression>
-    static void static_begin_block(void* instance, size_t size)
+    template <typename Expression, size_t N, index_t VecAxis>
+    static void static_set_elements(void* instance, shape<Dims> index, const T* src)
     {
-        return static_cast<Expression*>(instance)->begin_block(cinput, size);
+        set_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>, read<N>(src));
     }
 
     template <typename Expression>
-    static void static_end_block(void* instance, size_t size)
+    static void static_shapeof(void* instance, shape<Dims>& result)
     {
-        return static_cast<Expression*>(instance)->end_block(cinput, size);
+        result = expression_traits<Expression>::shapeof(*static_cast<Expression*>(instance));
     }
 
     template <typename Expression>
-    static bool static_substitute(void* instance, expression_pointer<T>&& ptr)
+    static bool static_substitute(void* instance, expression_pointer<T> ptr)
     {
         return internal::invoke_substitute(*static_cast<Expression*>(instance), std::move(ptr));
     }
@@ -143,51 +145,77 @@ KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e)
         new (aligned_allocate<T>()) T(std::move(e)), [](T* pi) { aligned_deallocate<T>(pi); }));
 }
 
-template <typename T, bool enable_resource>
-struct expression_pointer : input_expression
+template <typename T, index_t Dims>
+struct xpointer
 {
-    using value_type = T;
+    void* instance;
+    const expression_vtable<T, Dims>* vtable;
+    std::shared_ptr<expression_resource> resource;
 
-    expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
-    expression_pointer(const void* instance, const expression_vtable<T>* vtable,
-                       std::shared_ptr<expression_resource> resource = nullptr)
+    xpointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
+    xpointer(const void* instance, const expression_vtable<T, Dims>* vtable,
+             std::shared_ptr<expression_resource> resource = nullptr)
         : instance(const_cast<void*>(instance)), vtable(vtable), resource(std::move(resource))
     {
     }
-    template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)>
-    friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index,
-                                                vec_shape<T, N>)
+
+    explicit operator bool() const { return instance != nullptr; }
+};
+
+template <typename T, index_t Dims>
+struct expression_traits<xpointer<T, Dims>> : expression_traits_defaults
+{
+    using value_type = T;
+    constexpr static size_t dims = Dims;
+    constexpr static shape<dims> shapeof(const xpointer<T, Dims>& self)
     {
-        static_assert(is_poweroftwo(N), "N must be a power of two");
-        vec<T, N> result;
-        static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result);
-        return result;
+        shape<dims> result;
+        self.vtable->fn_shapeof(self.instance, result);
     }
-    template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)>
-    friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput, size_t index,
-                                                vec_shape<T, N>)
+    constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
+
+    constexpr static inline bool random_access = false;
+};
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+                                     const axis_params<Axis, N>& sh)
+{
+    static_assert(is_poweroftwo(N) && N >= 1);
+    if constexpr (N > expression_vtable<T, NDims>::Nmax)
     {
-        static_assert(is_poweroftwo(N), "N must be a power of two");
-        const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>());
-        const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>());
-        return concat(r1, r2);
+        constexpr size_t Nhalf = N / 2;
+        return concat(get_elements(self, index, axis_params_v<Axis, Nhalf>),
+                      get_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>));
     }
-    KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); }
-    KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); }
-    KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); }
-
-    KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const
+    else
     {
-        return vtable->substitute(instance, std::move(new_pointer));
+        portable_vec<T, N> result;
+        self.vtable->fn_get_elements[Axis][ilog2(N)](self.instance, index, result.elem);
+        return result;
     }
+}
 
-    explicit operator bool() const { return instance != nullptr; }
-
-private:
-    void* instance;
-    const expression_vtable<T>* vtable;
-    std::shared_ptr<expression_resource> resource;
-};
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC void set_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+                                const axis_params<Axis, N>& sh, const identity<vec<T, N>>& value)
+{
+    static_assert(is_poweroftwo(N) && N >= 1);
+    if constexpr (N > expression_vtable<T, NDims>::Nmax)
+    {
+        constexpr size_t Nhalf = N / 2;
+        set_elements(self, index, axis_params_v<Axis, Nhalf>, slice<0, Nhalf>(value));
+        set_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>, slice<Nhalf, Nhalf>(value));
    }
+    else
+    {
+        self.vtable->fn_set_elements[Axis][ilog2(N)](self.instance, index, &value.front());
+    }
+}
+} // namespace CMT_ARCH_NAME
 
 inline namespace CMT_ARCH_NAME
 {
@@ -195,11 +223,10 @@ inline namespace CMT_ARCH_NAME
 namespace internal
 {
 
-template <typename T, typename E>
-KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
+template <typename T, index_t Dims, typename E>
+KFR_INTRINSIC expression_vtable<T, Dims>* make_expression_vtable()
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
-    static expression_vtable<T> vtable{ ctype_t<decay<E>>{} };
+    static expression_vtable<T, Dims> vtable{ ctype_t<decay<E>>{} };
     return &vtable;
 }
 } // namespace internal
@@ -208,26 +235,25 @@ KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
 * This overload takes reference to the expression.
 * @warning Use with caution with local variables.
 */
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E& expr)
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
-    return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>());
+    return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, Dims, E>());
 }
 
 /** @brief Converts the given expression into an opaque object.
 * This overload takes ownership of the expression (Move semantics).
 * @note Use std::move to force use of this overload.
 */
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E&& expr)
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
     std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr));
     void* instance = ptr->instance();
-    return expression_pointer<T>(instance, internal::make_expression_vtable<T, E>(), std::move(ptr));
+    return expression_pointer<T, Dims>(instance, internal::make_expression_vtable<T, E>(), std::move(ptr));
 }
 
+#if 0
 template <typename T, size_t key>
 class expression_placeholder : public input_expression
 {
@@ -306,5 +332,7 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
 }
 } // namespace internal
+#endif
 
 } // namespace CMT_ARCH_NAME
-} // namespace kfr
+
+}
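Usage sketch for the reworked type-erasure API (not part of this commit; assumes expression_pointer is the public name for xpointer here and that truncate() accepts the erased expression):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // erase the concrete expression type; only the value type and dims stay visible
    auto expr = to_pointer(gen_linear(0.0f, 1.0f));
    // reads go through the vtable's fn_get_elements[axis][log2(width)] table
    univector<float, 4> out = truncate(expr, 4); // { 0, 1, 2, 3 }
    return 0;
}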
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -25,92 +25,14 @@
 */
 #pragma once
 
-#include "../simd/impl/function.hpp"
-#include "../simd/operators.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-#include "expression.hpp"
-#include <functional>
-
-#ifdef CMT_ARCH_ARM
-#define KFR_DISABLE_READCYCLECOUNTER
-#endif
+#include "random_bits.hpp"
 
 namespace kfr
 {
 
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-struct seed_from_rdtsc_t
-{
-};
-
-constexpr seed_from_rdtsc_t seed_from_rdtsc{};
-#endif
-
 inline namespace CMT_ARCH_NAME
 {
 
-using random_state = u32x4;
-
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-#ifdef CMT_COMPILER_CLANG
-#define KFR_builtin_readcyclecounter() \
-    static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
-#else
-#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
-#endif
-#endif
-
-struct random_bit_generator
-{
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-    KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
-        : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
-                                         (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
-    {
-        (void)operator()();
-    }
-#endif
-    KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
-        : state(x0, x1, x2, x3)
-    {
-        (void)operator()();
-    }
-    KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
-        : state(bitcast<u32>(make_vector(x0, x1)))
-    {
-        (void)operator()();
-    }
-
-    KFR_MEM_INTRINSIC random_state operator()()
-    {
-        const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
-        const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
-        state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
-        return state;
-    }
-
-protected:
-    random_state state;
-};
-
-static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
-
-template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
-    return narrow<N>(bitcast<u8>(gen()));
-}
-
-template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
-    constexpr size_t N2 = prev_poweroftwo(N - 1);
-    const vec<u8, N2> bits1 = random_bits<N2>(gen);
-    const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
-    return concat(bits1, bits2);
-}
-
 template <typename T, size_t N, KFR_ENABLE_IF(is_integral<T>)>
 KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen)
 {
diff --git a/include/kfr/base/random_bits.hpp b/include/kfr/base/random_bits.hpp
@@ -0,0 +1,114 @@
+/** @addtogroup random
+ *  @{
+ */
+/*
+  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+  This file is part of KFR
+
+  KFR is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 2 of the License, or
+  (at your option) any later version.
+
+  KFR is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with KFR.
+
+  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+  Buying a commercial license is mandatory as soon as you develop commercial activities without
+  disclosing the source code of your own applications.
+  See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/vec.hpp"
+#include "expression.hpp"
+#include <functional>
+
+#ifdef CMT_ARCH_ARM
+#define KFR_DISABLE_READCYCLECOUNTER
+#endif
+
+namespace kfr
+{
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+struct seed_from_rdtsc_t
+{
+};
+
+constexpr seed_from_rdtsc_t seed_from_rdtsc{};
+#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
+using random_state = u32x4;
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+#ifdef CMT_COMPILER_CLANG
+#define KFR_builtin_readcyclecounter() \
+    static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
+#else
+#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
+#endif
+#endif
+
+struct random_bit_generator
+{
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+    KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
+        : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
+                                         (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
+    {
+        (void)operator()();
+    }
+#endif
+    KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
+        : state(x0, x1, x2, x3)
+    {
+        (void)operator()();
+    }
+    KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
+        : state(bitcast<u32>(make_vector(x0, x1)))
+    {
+        (void)operator()();
+    }
+
+    KFR_MEM_INTRINSIC random_state operator()()
+    {
+        const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
+        const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
+        state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
+        return state;
+    }
+
+protected:
+    random_state state;
+};
+
+static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
+
+template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+    return narrow<N>(bitcast<u8>(gen()));
+}
+
+template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+    constexpr size_t N2 = prev_poweroftwo(N - 1);
+    const vec<u8, N2> bits1 = random_bits<N2>(gen);
+    const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
+    return concat(bits1, bits2);
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
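Usage sketch for the relocated RNG (not part of this commit); the same SIMD LCG is now reachable through random_bits.hpp:

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // seed the four u32 lanes explicitly (deterministic across runs)
    random_bit_generator gen{ 1u, 2u, 3u, 4u };
    // 16 raw bytes from one state update; wider requests concatenate updates
    vec<u8, 16> bytes = random_bits<16>(gen);
    (void)bytes;
    return 0;
}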
"../simd/impl/function.hpp" #include "../simd/operators.hpp" diff --git a/include/kfr/base/shape.hpp b/include/kfr/base/shape.hpp @@ -27,7 +27,7 @@ #include "impl/static_array.hpp" -#include "../math/min_max.hpp" +#include "../simd/min_max.hpp" #include "../simd/shuffle.hpp" #include "../simd/types.hpp" #include "../simd/vec.hpp" @@ -60,6 +60,8 @@ constexpr inline cindex_t<val> cindex{}; constexpr inline index_t infinite_size = max_index_t; +constexpr inline index_t undefined_size = 0; + constexpr inline index_t maximum_dims = 8; using dimset = vec<i8, maximum_dims>; @@ -142,20 +144,36 @@ struct shape : static_array_base<index_t, csizeseq_t<dims>> return false; } - shape add_inf(const shape& other) const + friend shape add_shape(const shape& lhs, const shape& rhs) { - vec<index_t, dims> x = **this; - vec<index_t, dims> y = *other; - mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size); + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; return select(inf, infinite_size, x + y); } - shape sub_inf(const shape& other) const + friend shape sub_shape(const shape& lhs, const shape& rhs) { - vec<index_t, dims> x = **this; - vec<index_t, dims> y = *other; - mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size); + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; return select(inf, infinite_size, x - y); } + friend shape add_shape_undef(const shape& lhs, const shape& rhs) + { + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; + mask<index_t, dims> undef = min(x, y) == undefined_size; + return select(inf, infinite_size, select(undef, undefined_size, x + y)); + } + friend shape sub_shape_undef(const shape& lhs, const shape& rhs) + { + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; + mask<index_t, dims> undef = min(x, y) == undefined_size; + return select(inf, infinite_size, select(undef, undefined_size, x - y)); + } friend shape min(const shape& x, const shape& y) { return kfr::min(*x, *y); } diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -1,103 +0,0 @@ -/** @addtogroup utility - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp
@@ -1,103 +0,0 @@
-/** @addtogroup utility
- *  @{
- */
-/*
-  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
-  This file is part of KFR
-
-  KFR is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 2 of the License, or
-  (at your option) any later version.
-
-  KFR is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with KFR.
-
-  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
-  Buying a commercial license is mandatory as soon as you develop commercial activities without
-  disclosing the source code of your own applications.
-  See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../math/min_max.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-/**
- * @brief Sort the elements in the vector in ascending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x)
-{
-    constexpr size_t Nhalf = N / 2;
-    vec<T, Nhalf> e = low(x);
-    vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
-    for (size_t i = 0; i < Nhalf; i++)
-    {
-        vec<T, Nhalf> t;
-        t = min(e, o);
-        o = max(e, o);
-        o = rotateright<1>(o);
-        e = t;
-        t = max(e, o);
-        o = min(e, o);
-        e = t;
-        t = blend(e, o, blend0);
-        o = blend(o, e, blend0);
-        o = rotateleft<1>(o);
-        e = t;
-    }
-    return interleavehalves(concat(e, o));
-}
-
-/**
- * @brief Sort the elements in the vector in descending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x)
-{
-    constexpr size_t Nhalf = N / 2;
-    vec<T, Nhalf> e = low(x);
-    vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
-    for (size_t i = 0; i < Nhalf; i++)
-    {
-        vec<T, Nhalf> t;
-        t = max(e, o);
-        o = min(e, o);
-        o = rotateright<1>(o);
-        e = t;
-        t = min(e, o);
-        o = max(e, o);
-        e = t;
-        t = blend(e, o, blend0);
-        o = blend(o, e, blend0);
-        o = rotateleft<1>(o);
-        e = t;
-    }
-    return interleavehalves(concat(e, o));
-}
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/base/tensor.hpp b/include/kfr/base/tensor.hpp
@@ -29,8 +29,8 @@
 
 #include "../cometa/array.hpp"
 
-#include "../math/logical.hpp"
-#include "../math/min_max.hpp"
+#include "../simd/logical.hpp"
+#include "../simd/min_max.hpp"
 #include "../simd/horizontal.hpp"
 #include "../simd/impl/function.hpp"
 #include "../simd/read_write.hpp"
@@ -914,7 +914,7 @@ struct expression_traits<tensor<T, Dims>> : expression_traits_defaults
     {
         return self.shape();
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
 };
 
 inline namespace CMT_ARCH_NAME
diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp
@@ -24,22 +24,15 @@
 
 #include "simd.hpp"
 
-#include "math/abs.hpp"
 #include "math/asin_acos.hpp"
 #include "math/atan.hpp"
-#include "math/clamp.hpp"
 #include "math/compiletime.hpp"
 #include "math/complex_math.hpp"
 #include "math/gamma.hpp"
 #include "math/hyperbolic.hpp"
 #include "math/interpolation.hpp"
 #include "math/log_exp.hpp"
-#include "math/logical.hpp"
-#include "math/min_max.hpp"
 #include "math/modzerobessel.hpp"
-#include "math/round.hpp"
-#include "math/saturation.hpp"
-#include "math/select.hpp"
 #include "math/sin_cos.hpp"
 #include "math/sqrt.hpp"
 #include "math/tan.hpp"
diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp
@@ -26,12 +26,12 @@
 #pragma once
 
 #include "../simd/complex.hpp"
-#include "abs.hpp"
+#include "../simd/abs.hpp"
 #include "atan.hpp"
 #include "hyperbolic.hpp"
 #include "log_exp.hpp"
-#include "min_max.hpp"
-#include "select.hpp"
+#include "../simd/min_max.hpp"
+#include "../simd/select.hpp"
 #include "sin_cos.hpp"
 #include "sqrt.hpp"
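sort()/sortdesc() move verbatim to include/kfr/simd/sort.hpp (the A entry in the diffstat); usage stays as in the deleted doc comments:

#include <kfr/simd/sort.hpp>

using namespace kfr;

int main()
{
    const vec<int, 4> v = make_vector(1000, 1, 2, -10);
    const vec<int, 4> asc = sort(v);      // { -10, 1, 2, 1000 }
    const vec<int, 4> desc = sortdesc(v); // { 1000, 2, 1, -10 }
    return asc[0] == -10 && desc[0] == 1000 ? 0 : 1;
}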
diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp
@@ -1,138 +0,0 @@
-/*
-  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
-  This file is part of KFR
-
-  KFR is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 2 of the License, or
-  (at your option) any later version.
-
-  KFR is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with KFR.
-
-  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
-  Buying a commercial license is mandatory as soon as you develop commercial activities without
-  disclosing the source code of your own applications.
-  See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/select.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return x & special_constants<T>::invhighbitmask();
-}
-
-KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT
-{
-    const __m128i sh = _mm_srai_epi32(x.v, 31);
-    const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
-    return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk);
-}
-KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); }
-KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); }
-KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); }
-KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; }
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT
-{
-    const __m256i sh = _mm256_srai_epi32(x.v, 31);
-    const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
-    return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk);
-}
-KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); }
-KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; }
-#endif
-
-KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); }
-KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); }
-KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); }
-#else
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); }
-#endif
-
-KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; }
-
-KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); }
-#else
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT
-{
-    return x & special_constants<f64>::invhighbitmask();
-}
-#endif
-
-KFR_HANDLE_ALL_SIZES_1(abs)
-
-#else
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return x & special_constants<T>::invhighbitmask();
-}
-
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return select(x >= T(0), x, -x);
-}
-#endif
-KFR_HANDLE_SCALAR(abs)
-} // namespace intrinsics
-
-KFR_I_FN(abs)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp
@@ -22,10 +22,10 @@
 */
 #pragma once
 
-#include "../../math/atan.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sqrt.hpp"
 #include "../../simd/impl/function.hpp"
+#include "../../simd/select.hpp"
+#include "../atan.hpp"
+#include "../sqrt.hpp"
 
 namespace kfr
 {
diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp
@@ -21,9 +21,9 @@ See https://www.kfrlib.com for details.
 */
 #pragma once
-#include "../../math/abs.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sin_cos.hpp"
+#include "../../simd/abs.hpp"
+#include "../../simd/select.hpp"
+#include "../sin_cos.hpp"
 #include "../../simd/constants.hpp"
 #include "../../simd/impl/function.hpp"
 #include "../../simd/operators.hpp"
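The deleted abs implementation (re-added unchanged under include/kfr/simd/impl/) clears the IEEE-754 sign bit with a mask on the float path; a scalar sketch of that trick in plain C++ (abs_via_mask is a hypothetical name, not KFR API):

#include <cstdint>
#include <cstring>

// scalar analogue of x & special_constants<T>::invhighbitmask()
static float abs_via_mask(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits); // type-pun without UB
    bits &= 0x7fffffffu;                 // drop the sign bit
    std::memcpy(&x, &bits, sizeof x);
    return x;
}

int main() { return abs_via_mask(-1.5f) == 1.5f ? 0 : 1; }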
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../../math/min_max.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -template <typename T> -KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) -{ - return max(min(x, hi), zerovector<T, N>()); -} -} // namespace intrinsics -KFR_I_FN(clamp) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp @@ -22,10 +22,10 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/log_exp.hpp" -#include "../../math/min_max.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../log_exp.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp @@ -22,11 +22,11 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/clamp.hpp" -#include "../../math/min_max.hpp" -#include "../../math/round.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../../simd/clamp.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/round.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp @@ -1,284 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
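// A portable sketch of what the bittestany/bittestall overloads below compute:
// bittestany is a horizontal OR over the mask bits (SSE4.1 maps it to
// !_mm_testz_si128(x, x)), bittestall a horizontal AND (_mm_testc_si128
// against an all-ones vector). Illustrative helpers over raw lane masks,
// not KFR API; each lane is assumed all-ones or all-zeros.
#include <cstddef>
#include <cstdint>
inline bool any_lane_sketch(const uint32_t* lanes, size_t n)
{
    uint32_t acc = 0;
    for (size_t i = 0; i < n; ++i) acc |= lanes[i]; // horizontal OR
    return acc != 0;
}
inline bool all_lanes_sketch(const uint32_t* lanes, size_t n)
{
    uint32_t acc = ~0u;
    for (size_t i = 0; i < n; ++i) acc &= lanes[i]; // horizontal AND
    return acc == ~0u;
}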
- */ -#pragma once - -#include "../../math/abs.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -#if defined CMT_ARCH_SSE41 - -// horizontal OR -KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -#endif - -#if defined CMT_ARCH_AVX -// horizontal OR -KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); } - -KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); } - -KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } - -KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } - -KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool 
bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } - -#if defined CMT_ARCH_AVX512 -// horizontal OR -KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mf32avx512& x) -{ - return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); -} -KFR_INTRINSIC bool bittestall(const mf64avx512& x) -{ - return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); -} -KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } - -#endif - -#elif defined CMT_ARCH_SSE41 -KFR_INTRINSIC bool bittestany(const mf32sse& x) -{ - return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v); -} -KFR_INTRINSIC bool bittestany(const mf64sse& x) -{ - return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v); -} -KFR_INTRINSIC bool bittestall(const mf32sse& x) -{ - return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v); -} -KFR_INTRINSIC bool bittestall(const mf64sse& x) -{ - return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v); -} -#endif - -#if !defined CMT_ARCH_SSE41 - -KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); } -KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); } -KFR_INTRINSIC bool bittestany(const 
mu8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); } - -KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); } -KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); } -KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); } -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(expand_simd(a, bit<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(expand_simd(a, bit<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC bool bittestall(const mu32neon& a) -{ - const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); - return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; -} - -KFR_INTRINSIC bool bittestany(const mu32neon& a) -{ - const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; -} -KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } - -KFR_INTRINSIC bool bittestall(const mu8neon& a) { return 
bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(expand_simd(a, bit<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(expand_simd(a, bit<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#else - -template <typename T, size_t N> -KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x) -{ - typename bitmask<N>::type val = 0; - for (size_t i = 0; i < N; i++) - { - val |= static_cast<int>(x[i]) << i; - } - return val; -} - -template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const mask<T, N>& x) -{ - return getmask(x).value; -} -template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y) -{ - return bittestany(x & y); -} - -template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const mask<T, N>& x) -{ - return !getmask(~x).value; -} -template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y) -{ - return !bittestany(~x & y); -} -#endif -} // namespace intrinsics -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp @@ -1,236 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
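// The identity the min/max code below falls back on whenever no native vector
// instruction exists for a type (e.g. 64-bit min/max before AVX-512):
// min(x, y) == select(x < y, x, y), i.e. one compare plus a bitwise blend.
// One-lane sketch with an illustrative name, not KFR API.
#include <cstdint>
inline uint32_t min_u32_sketch(uint32_t x, uint32_t y)
{
    const uint32_t m = x < y ? ~0u : 0u; // comparison mask for one lane
    return (x & m) | (y & ~m);           // blend: x where mask set, else y
}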
- */ -#pragma once - -#include "../../math/abs.hpp" -#include "../../math/select.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); } -KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); } -KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); } - -KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); } -KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); } -KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); } - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); } - -KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); } - -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); } -KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); } -KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); } -KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); } - -KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); } -KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return 
_mm512_max_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); } -KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); } - -KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); } - -KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); } -#else -KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } -KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } -#endif - -#if defined CMT_ARCH_AVX -KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); } -KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); } -KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); } -KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); } -#endif - -#if defined CMT_ARCH_SSE41 -KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); } - -KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); } 
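// How the #else branch just below emulates the missing instructions:
// _mm_min_epi32 and friends only exist from SSE4.1 on, so earlier targets
// compare and blend instead. A minimal SSE2-only sketch (illustrative, not
// KFR code):
#include <emmintrin.h>
static inline __m128i min_epi32_sse2_sketch(__m128i x, __m128i y)
{
    const __m128i m = _mm_cmplt_epi32(x, y);    // all ones where x < y
    return _mm_or_si128(_mm_and_si128(m, x),    // keep x in those lanes
                        _mm_andnot_si128(m, y)); // keep y elsewhere
}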
-KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); } -#else -KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } - -KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } - -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } -KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } -KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } -KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } -KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } -KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } -KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } - -KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } -KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } -KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } -KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } -KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } -KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } -KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } - -KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } -KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } -#if defined CMT_ARCH_NEON64 -KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } -KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } -#else -KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } -KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#else - -// fallback -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x < y, x, y); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x > y, x, y); -} -#endif - -template <typename T> -KFR_INTRINSIC T min(initialvalue<T>) -{ - return 
std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_INTRINSIC T max(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::min(); -} -template <typename T> -KFR_INTRINSIC T absmin(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_INTRINSIC T absmax(initialvalue<T>) -{ - return 0; -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) -{ - return min(abs(x), abs(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) -{ - return max(abs(x), abs(y)); -} - -KFR_HANDLE_SCALAR(min) -KFR_HANDLE_SCALAR(max) -KFR_HANDLE_SCALAR(absmin) -KFR_HANDLE_SCALAR(absmax) -} // namespace intrinsics -KFR_I_FN(min) -KFR_I_FN(max) -KFR_I_FN(absmin) -KFR_I_FN(absmax) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp @@ -1,282 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
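// Why the non-SSE4.1 fallback further below may return large inputs
// unchanged: at or beyond fp_precision_limit (2^24 for f32, 2^52 for f64)
// the floating-point grid spacing is at least 1, so every representable
// value is already an integer. Scalar sketch of the floor path, names
// illustrative only:
#include <cstdint>
inline float floor_sketch(float x)
{
    if (!(x < 16777216.0f && x > -16777216.0f))
        return x;                      // huge, inf or NaN: pass through
    const float t = (float)(int32_t)x; // truncate toward zero
    return t - (x < t ? 1.0f : 0.0f);  // step down when truncation rounded up
}
// floor_sketch(-2.5f) == -3.0f, floor_sketch(2.5f) == 2.0f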
- */ -#pragma once - -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" -#include "abs.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ss(V) \ - _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_sd(V) \ - _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) -#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) - -#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } -KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } -KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } -KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } -KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } -KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } -KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } -KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } -KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } -KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } - -#if defined CMT_ARCH_AVX - -KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } -KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } -KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } -KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } -KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } -KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } -KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } -KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } -KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } -KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } - -#endif - -#if defined CMT_ARCH_AVX512 - -KFR_INTRINSIC f32avx512 floor(const f32avx512& value) -{ - 
return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 round(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 floor(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 round(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } -KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } -#endif - -KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>) - -#else - -// fallback - -template <typename T> -constexpr inline T fp_precision_limit = 4503599627370496.0; -template <> -constexpr inline f32 fp_precision_limit<f32> = 16777216.0f; - -template <size_t N> -KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) -{ - vec<f32, N> t = innercast<f32>(innercast<i32>(x)); - return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) -{ - vec<f64, N> t = innercast<f64>(innercast<i64>(x)); - return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) -{ - vec<f32, N> t = innercast<f32>(innercast<i32>(x)); - return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) -{ - vec<f64, N> t = innercast<f64>(innercast<i64>(x)); - return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f32>, x, - innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f64>, x, - innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) -{ - return x - floor(x); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) -{ - return x - floor(x); -} -#endif - -template <typename T, 
size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) -{ - return T(0); -} - -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) -{ - return innercast<IT>(floor(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) -{ - return innercast<IT>(ceil(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) -{ - return innercast<IT>(trunc(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) -{ - return innercast<IT>(round(value)); -} - -KFR_HANDLE_SCALAR(floor) -KFR_HANDLE_SCALAR(ceil) -KFR_HANDLE_SCALAR(round) -KFR_HANDLE_SCALAR(trunc) -KFR_HANDLE_SCALAR(fract) -KFR_HANDLE_SCALAR(ifloor) -KFR_HANDLE_SCALAR(iceil) -KFR_HANDLE_SCALAR(iround) -KFR_HANDLE_SCALAR(itrunc) -} // namespace intrinsics -KFR_I_FN(floor) -KFR_I_FN(ceil) -KFR_I_FN(round) -KFR_I_FN(trunc) -KFR_I_FN(fract) -KFR_I_FN(ifloor) -KFR_I_FN(iceil) -KFR_I_FN(iround) -KFR_I_FN(itrunc) -} // namespace CMT_ARCH_NAME -} // namespace kfr - -#undef KFR_mm_trunc_ps -#undef KFR_mm_roundnearest_ps -#undef KFR_mm_trunc_pd -#undef KFR_mm_roundnearest_pd -#undef KFR_mm_trunc_ss -#undef KFR_mm_roundnearest_ss -#undef KFR_mm_trunc_sd -#undef KFR_mm_roundnearest_sd -#undef KFR_mm_floor_ss -#undef KFR_mm_floor_sd -#undef KFR_mm_ceil_ss -#undef KFR_mm_ceil_sd -#undef KFR_mm256_trunc_ps -#undef KFR_mm256_roundnearest_ps -#undef KFR_mm256_trunc_pd -#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp @@ -1,205 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
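// The branch-free pattern behind saturated_unsigned_add/sub below: unsigned
// overflow is detected against the remaining headroom, then select()
// substitutes the clamped bound. Scalar sketch with illustrative names,
// not KFR API:
#include <cstdint>
inline uint32_t satadd_u32_sketch(uint32_t a, uint32_t b)
{
    return a > UINT32_MAX - b ? UINT32_MAX : a + b; // clamp at the top
}
inline uint32_t satsub_u32_sketch(uint32_t a, uint32_t b)
{
    return a < b ? 0u : a - b;                      // clamp at zero
}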
- */ -#pragma once - -#include "../../math/select.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -// Generic functions -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> sum = aa + bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), a, bitcast<T>(sum)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> diff = aa - bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = allonesvector(a); - return select(a > t - b, t, a + b); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - return select(a < b, zerovector(a), a - b); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); } -KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); } - -KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } -KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } - -KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } -KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); } -KFR_INTRINSIC 
i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } - -KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } -KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } - -KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } -KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } -KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) -{ - return saturated_unsigned_add(a, b); -} -KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) -{ - return saturated_unsigned_add(a, b); -} -KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) -{ - return saturated_unsigned_sub(a, b); -} -KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } -KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } -KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { 
return vqaddq_u16(x.v, y.v); } -KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } -KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } -KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } -KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } -KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } - -KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } -KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } -KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } -KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } -KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } -KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } -KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } -KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#else -// fallback -template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> -KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> -KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> -KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_sub(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> -KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif -KFR_HANDLE_SCALAR(satadd) -KFR_HANDLE_SCALAR(satsub) -} // namespace intrinsics -KFR_I_FN(satadd) -KFR_I_FN(satsub) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp @@ -1,331 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
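// Two equivalent forms of select(mask, x, y) appear below: SSE4.1 and later
// use the blendv instructions, while the generic and ARMv7 f64 paths use the
// xor blend r = y ^ ((x ^ y) & m), which yields x where m is all ones and y
// where m is all zeros. One-lane sketch (illustrative name, not KFR API):
#include <cstdint>
inline uint32_t select_sketch(uint32_t m, uint32_t x, uint32_t y)
{
    return y ^ ((x ^ y) & m); // m must be all-ones or all-zeros per lane
}
// select_sketch(~0u, 1, 2) == 1 and select_sketch(0u, 1, 2) == 2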
- */ -#pragma once - -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y) -{ - return _mm_blendv_ps(y.v, x.v, m.v); -} -KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y) -{ - return _mm_blendv_pd(y.v, x.v, m.v); -} - -#if defined CMT_ARCH_AVX -KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y) -{ - return _mm256_blendv_pd(y.v, x.v, m.v); -} -KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y) -{ - return _mm256_blendv_ps(y.v, x.v, m.v); -} -#endif - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y) -{ - return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v); -} -KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y) -{ - return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v); -} -KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y) -{ - 
return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); - // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) -{ - return concat2(select(a.h.low, b, c), select(a.h.high, b, c)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) -{ - return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) -{ - return 
concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); -} - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y) -{ - return vbslq_f32(m.v, x.v, y.v); -} -KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) -{ - return vbslq_s8(m.v, x.v, y.v); -} -KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) -{ - return vbslq_u8(m.v, x.v, y.v); -} -KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y) -{ - return vbslq_s16(m.v, x.v, y.v); -} -KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y) -{ - return vbslq_u16(m.v, x.v, y.v); -} -KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y) -{ - return vbslq_s32(m.v, x.v, y.v); -} -KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y) -{ - return vbslq_u32(m.v, x.v, y.v); -} -KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y) -{ - return vbslq_s64(m.v, x.v, y.v); -} -KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y) -{ - return vbslq_u64(m.v, x.v, y.v); -} - -#ifdef CMT_ARCH_NEON64 -KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) -{ - return vbslq_f64(m.v, x.v, y.v); -} -#else -KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) -{ - return select(m, vec<T, N>(x), vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) -{ - return select(m, x, vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) -{ - return select(m, vec<T, N>(x), y); -} - -#else - -// fallback -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) -{ - return select(m, vec<T, N>(x), vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) -{ - return select(m, x, vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) -{ - return select(m, vec<T, N>(x), y); -} -#endif -template <typename T1, typename T2> -KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y) -{ - return m ? 
x : y; -} - -} // namespace intrinsics -KFR_I_FN(select) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp @@ -22,10 +22,10 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/min_max.hpp" -#include "../../math/round.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/round.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp @@ -22,12 +22,12 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/select.hpp" -#include "../../math/sin_cos.hpp" +#include "../../simd/abs.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" +#include "../../simd/select.hpp" +#include "../sin_cos.hpp" namespace kfr { diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp @@ -25,7 +25,7 @@ */ #pragma once -#include "select.hpp" +#include "../simd/select.hpp" #include "sin_cos.hpp" namespace kfr diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp @@ -22,15 +22,23 @@ */ #pragma once +#include "simd/abs.hpp" +#include "simd/clamp.hpp" #include "simd/comparison.hpp" #include "simd/complex.hpp" #include "simd/constants.hpp" #include "simd/digitreverse.hpp" #include "simd/horizontal.hpp" +#include "simd/logical.hpp" #include "simd/mask.hpp" +#include "simd/min_max.hpp" #include "simd/operators.hpp" #include "simd/platform.hpp" #include "simd/read_write.hpp" +#include "simd/round.hpp" +#include "simd/saturation.hpp" +#include "simd/select.hpp" #include "simd/shuffle.hpp" +#include "simd/sort.hpp" #include "simd/types.hpp" #include "simd/vec.hpp" diff --git a/include/kfr/math/abs.hpp b/include/kfr/simd/abs.hpp diff --git a/include/kfr/math/clamp.hpp b/include/kfr/simd/clamp.hpp diff --git a/include/kfr/simd/impl/abs.hpp b/include/kfr/simd/impl/abs.hpp @@ -0,0 +1,138 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
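
All of the select() overloads above implement one contract: keep x in lanes where the mask is true, y elsewhere. Two details are easy to misread. The SSE4.1/AVX blendv intrinsics pick their second operand where the mask's top bit is set, which is why every wrapper passes (y.v, x.v, m.v) rather than (x.v, y.v, m.v). And where no blend instruction exists, selection reduces to pure bit arithmetic, because the masks are all-ones or all-zeros per lane. A minimal scalar model of that fallback (illustrative only; select_bits is not a KFR symbol):

    #include <cstdint>

    // Model of the vector fallback y ^ ((x ^ y) & m):
    // m all-ones: y ^ (x ^ y) == x; m all-zeros: y unchanged.
    inline uint32_t select_bits(uint32_t m, uint32_t x, uint32_t y)
    {
        return y ^ ((x ^ y) & m);
    }

    // Usage with a comparison-produced mask, mirroring SIMD semantics:
    // uint32_t r = select_bits(a < b ? 0xFFFFFFFFu : 0u, a, b);
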
+ */ +#pragma once + +#include "../select.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & special_constants<T>::invhighbitmask(); +} + +KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT +{ + const __m128i sh = _mm_srai_epi32(x.v, 31); + const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk); +} +KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); } +KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); } +KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); } +KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT +{ + const __m256i sh = _mm256_srai_epi32(x.v, 31); + const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk); +} +KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); } +KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); } +KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); } +KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); } +KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); } +KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); } +KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); } +KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); } +KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); } +KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); } +#else +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); } +#endif + +KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32neon abs(const u32neon& x) 
CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; } + +KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); } +#else +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT +{ + return x & special_constants<f64>::invhighbitmask(); +} +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & special_constants<T>::invhighbitmask(); +} + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return select(x >= T(0), x, -x); +} +#endif +KFR_HANDLE_SCALAR(abs) +} // namespace intrinsics + +KFR_I_FN(abs) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/clamp.hpp b/include/kfr/simd/impl/clamp.hpp @@ -0,0 +1,55 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T> +KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) +{ + return max(min(x, hi), zerovector<T, N>()); +} +} // namespace intrinsics +KFR_I_FN(clamp) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/logical.hpp b/include/kfr/simd/impl/logical.hpp @@ -0,0 +1,284 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
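
Two branch-free idioms carry the abs() kernels above: floating-point abs clears the sign bit by ANDing with invhighbitmask(), and the SSE i64 path builds a per-lane sign mask, then computes (x ^ msk) - msk, which negates exactly the negative lanes. A scalar sketch of the integer identity (illustrative; abs_branchless is not a KFR function):

    #include <cstdint>

    // msk is 0 for x >= 0 and all-ones for x < 0 (arithmetic shift).
    // (x ^ 0) - 0 == x; (x ^ ~0) - ~0 == ~x + 1 == -x in two's complement.
    inline int64_t abs_branchless(int64_t x)
    {
        const int64_t msk = x >> 63;
        return (x ^ msk) - msk;
    }
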
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../abs.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +#if defined CMT_ARCH_SSE41 + +// horizontal OR +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +#endif + +#if defined CMT_ARCH_AVX +// horizontal OR +KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } + +KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, 
allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+
+#if defined CMT_ARCH_AVX512
+// horizontal OR
+KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
+
+// horizontal AND
+// ~ promotes __mmask8/__mmask16 to int, so the complement must be truncated
+// back to the mask width before testing, as in the 32/64-bit integer overloads.
+KFR_INTRINSIC bool bittestall(const mf32avx512& x)
+{
+    return !uint16_t(~_mm512_movepi32_mask(_mm512_castps_si512(x.v)));
+}
+KFR_INTRINSIC bool bittestall(const mf64avx512& x)
+{
+    return !uint8_t(~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)));
+}
+KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+
+#endif
+
+#elif defined CMT_ARCH_SSE41
+KFR_INTRINSIC bool bittestany(const mf32sse& x)
+{
+    return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestany(const mf64sse& x)
+{
+    return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestall(const mf32sse& x)
+{
+    return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+KFR_INTRINSIC bool bittestall(const mf64sse& x)
+{
+    return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+#endif
+
+#if !defined CMT_ARCH_SSE41
+
+KFR_INTRINSIC bool bittestany(const mf32sse& x) {
return _mm_movemask_ps(x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); } +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); } + +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); } +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); } +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(expand_simd(a, bit<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(expand_simd(a, bit<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC bool bittestall(const mu32neon& a) +{ + const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_INTRINSIC bool bittestany(const mu32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool 
bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } + +KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(expand_simd(a, bit<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(expand_simd(a, bit<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#else + +template <typename T, size_t N> +KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x) +{ + typename bitmask<N>::type val = 0; + for (size_t i = 0; i < N; i++) + { + val |= static_cast<int>(x[i]) << i; + } + return val; +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const mask<T, N>& x) +{ + return getmask(x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y) +{ + return bittestany(x & y); +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const mask<T, N>& x) +{ + return !getmask(~x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y) +{ + return !bittestany(~x & y); +} +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/min_max.hpp b/include/kfr/simd/impl/min_max.hpp @@ -0,0 +1,236 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
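
Every bittestany()/bittestall() above collapses a lane mask to a single bool: any-lane is a horizontal OR, all-lanes a horizontal AND. The instruction differs by ISA: PTEST on SSE4.1 (testz reports (a & b) == 0, testc reports (~a & b) == 0, hence the allonesvector second operand), movemask before SSE4.1, k-registers on AVX-512, and pairwise min/max on NEON. A scalar model of the movemask route (illustrative; these helpers are not KFR symbols):

    #include <cstddef>
    #include <cstdint>

    // Gather the top bit of each lane into a bitmask, like _mm_movemask_ps.
    inline uint32_t movemask32(const uint32_t* lanes, size_t n)
    {
        uint32_t bits = 0;
        for (size_t i = 0; i < n; ++i)
            bits |= (lanes[i] >> 31) << i;
        return bits;
    }

    inline bool any_lane(const uint32_t* lanes, size_t n) { return movemask32(lanes, n) != 0; }

    inline bool all_lanes(const uint32_t* lanes, size_t n) // assumes 0 < n < 32
    {
        return movemask32(lanes, n) == (uint32_t(1) << n) - 1;
    }
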
+ */ +#pragma once + +#include "../abs.hpp" +#include "../select.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); } +KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); } +KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); } + +KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); } +KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); } +KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); } + +KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); } + +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); } + +KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); } +KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); } +KFR_INTRINSIC 
i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); } +KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); } + +KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); } + +KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); } +#else +KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } +KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } +#endif + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); } +KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); } +KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); } +KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); } +#endif + +#if defined CMT_ARCH_SSE41 +KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& 
y) { return _mm_max_epu32(x.v, y.v); } +#else +KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } + +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } + +KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } +KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } +#else +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x < y, x, y); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x > y, x, y); +} +#endif + +template <typename T> +KFR_INTRINSIC T min(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T max(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::min(); +} +template <typename T> +KFR_INTRINSIC T absmin(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T absmax(initialvalue<T>) +{ + return 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) +{ + return min(abs(x), abs(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) +{ + return max(abs(x), abs(y)); +} + +KFR_HANDLE_SCALAR(min) +KFR_HANDLE_SCALAR(max) +KFR_HANDLE_SCALAR(absmin) +KFR_HANDLE_SCALAR(absmax) +} // namespace intrinsics +KFR_I_FN(min) +KFR_I_FN(max) +KFR_I_FN(absmin) +KFR_I_FN(absmax) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/round.hpp b/include/kfr/simd/impl/round.hpp @@ -0,0 +1,282 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
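
Where the ISA lacks a native instruction, min() and max() above fall back to a comparison plus select(). The initialvalue overloads provide identity elements for reductions: min starts at +infinity (or the largest finite value for integer types), max at -infinity (or the smallest), and absmax at 0, so folding any element into the accumulator is a no-op until real data arrives. A sketch of an identity-seeded fold (illustrative; reduce_min is not a KFR function):

    #include <limits>
    #include <vector>

    template <typename T>
    T reduce_min(const std::vector<T>& v)
    {
        // Same identity as min(initialvalue<T>) above.
        T acc = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
                                                     : std::numeric_limits<T>::max();
        for (const T& x : v)
            acc = x < acc ? x : acc; // select(x < acc, x, acc) in vector form
        return acc;
    }
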
+ */ +#pragma once + +#include "function.hpp" +#include "../operators.hpp" +#include "abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ss(V) \ + _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_sd(V) \ + _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) +#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) + +#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } +KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } +KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } +KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } +KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } +KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } +KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } +KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } +KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } +KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } + +#if defined CMT_ARCH_AVX + +KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } +KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } +KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } +KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } +KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } +KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } +KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } +KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } +KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } +KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } + +#endif + +#if defined CMT_ARCH_AVX512 + +KFR_INTRINSIC f32avx512 floor(const f32avx512& value) +{ + return 
_mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 round(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 floor(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 round(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } +KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>) + +#else + +// fallback + +template <typename T> +constexpr inline T fp_precision_limit = 4503599627370496.0; +template <> +constexpr inline f32 fp_precision_limit<f32> = 16777216.0f; + +template <size_t N> +KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, + innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, + innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) +{ + return x - floor(x); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) +{ + return x - floor(x); +} +#endif + +template <typename T, size_t 
N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) +{ + return T(0); +} + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) +{ + return innercast<IT>(floor(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) +{ + return innercast<IT>(ceil(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) +{ + return innercast<IT>(trunc(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) +{ + return innercast<IT>(round(value)); +} + +KFR_HANDLE_SCALAR(floor) +KFR_HANDLE_SCALAR(ceil) +KFR_HANDLE_SCALAR(round) +KFR_HANDLE_SCALAR(trunc) +KFR_HANDLE_SCALAR(fract) +KFR_HANDLE_SCALAR(ifloor) +KFR_HANDLE_SCALAR(iceil) +KFR_HANDLE_SCALAR(iround) +KFR_HANDLE_SCALAR(itrunc) +} // namespace intrinsics +KFR_I_FN(floor) +KFR_I_FN(ceil) +KFR_I_FN(round) +KFR_I_FN(trunc) +KFR_I_FN(fract) +KFR_I_FN(ifloor) +KFR_I_FN(iceil) +KFR_I_FN(iround) +KFR_I_FN(itrunc) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#undef KFR_mm_trunc_ps +#undef KFR_mm_roundnearest_ps +#undef KFR_mm_trunc_pd +#undef KFR_mm_roundnearest_pd +#undef KFR_mm_trunc_ss +#undef KFR_mm_roundnearest_ss +#undef KFR_mm_trunc_sd +#undef KFR_mm_roundnearest_sd +#undef KFR_mm_floor_ss +#undef KFR_mm_floor_sd +#undef KFR_mm_ceil_ss +#undef KFR_mm_ceil_sd +#undef KFR_mm256_trunc_ps +#undef KFR_mm256_roundnearest_ps +#undef KFR_mm256_trunc_pd +#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/simd/impl/saturation.hpp b/include/kfr/simd/impl/saturation.hpp @@ -0,0 +1,205 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
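
The non-SSE4.1 rounding fallback above relies on one property: once |x| reaches 2^24 for f32 (16777216.0f) or 2^52 for f64 (4503599627370496.0), every representable value is already an integer, so x passes through unchanged and the int-cast below the limit can never overflow. trunc() round-trips through the integer type, floor()/ceil() nudge the truncated value by one when it lands on the wrong side, and round() first adds 0.5 with the sign of x (round half away from zero). A scalar sketch of the floor case (illustrative; floor_via_cast is not a KFR function):

    #include <cmath>
    #include <cstdint>

    inline float floor_via_cast(float x)
    {
        if (std::fabs(x) >= 16777216.0f) // 2^24: f32 exactness threshold
            return x;
        const float t = static_cast<float>(static_cast<int32_t>(x)); // truncates toward zero
        return x < t ? t - 1.0f : t; // step down for negative non-integers
    }
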
+ */
+#pragma once
+
+#include "../select.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Generic functions
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+    using UT               = utype<T>;
+    constexpr size_t shift = typebits<UT>::bits - 1;
+    vec<UT, N> aa          = bitcast<UT>(a);
+    vec<UT, N> bb          = bitcast<UT>(b);
+    const vec<UT, N> sum   = aa + bb;
+    aa                     = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+    // On overflow, return the saturation bound precomputed in aa from the sign of a.
+    return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), bitcast<T>(aa), bitcast<T>(sum));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    using UT               = utype<T>;
+    constexpr size_t shift = typebits<UT>::bits - 1;
+    vec<UT, N> aa          = bitcast<UT>(a);
+    vec<UT, N> bb          = bitcast<UT>(b);
+    const vec<UT, N> diff  = aa - bb;
+    aa                     = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+    // On overflow, return the saturation bound precomputed in aa from the sign of a.
+    return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), bitcast<T>(aa), bitcast<T>(diff));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+    const vec<T, N> t = allonesvector(a);
+    return select(a > t - b, t, a + b);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return select(a < b, zerovector(a), a - b);
+}
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satadd(const i16avx& x,
const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } +KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); } 
+KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } + +KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#else +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_sub(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif +KFR_HANDLE_SCALAR(satadd) +KFR_HANDLE_SCALAR(satsub) +} // namespace intrinsics +KFR_I_FN(satadd) +KFR_I_FN(satsub) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp @@ -0,0 +1,331 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
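The generic paths above are branch-free: the clamp value is derived from the sign of `a` alone (`(aa >> shift) + max` yields the type's maximum for non-negative `a` and wraps to its minimum for negative `a`), and overflow is detected from sign bits. A minimal scalar sketch of the same idea, with the overflow test written in its more common `(a ^ sum) & (b ^ sum)` form (illustrative only; `satadd_scalar` is not part of KFR):

#include <cstdint>
#include <cstdio>
#include <limits>

// Branch-free saturating signed add, scalar sketch of the SIMD routine above.
static int32_t satadd_scalar(int32_t a, int32_t b)
{
    const uint32_t ua  = static_cast<uint32_t>(a);
    const uint32_t ub  = static_cast<uint32_t>(b);
    const uint32_t sum = ua + ub; // wrapping add in unsigned arithmetic
    // INT32_MAX if a >= 0, INT32_MIN if a < 0: the only two possible clamps.
    const uint32_t sat = (ua >> 31) + static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
    // Signed overflow occurred iff a and b share a sign that differs from the sum's.
    const bool overflow = (((ua ^ sum) & (ub ^ sum)) >> 31) != 0;
    return static_cast<int32_t>(overflow ? sat : sum);
}

int main()
{
    std::printf("%d\n", satadd_scalar(INT32_MAX, 1));  // INT32_MAX, not wraparound
    std::printf("%d\n", satadd_scalar(INT32_MIN, -1)); // INT32_MIN
    std::printf("%d\n", satadd_scalar(100, 23));       // 123
}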
diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp
@@ -0,0 +1,331 @@
+/*
+  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+  This file is part of KFR
+
+  KFR is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 2 of the License, or
+  (at your option) any later version.
+
+  KFR is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with KFR.
+
+  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+  Buying a commercial license is mandatory as soon as you develop commercial activities without
+  disclosing the source code of your own applications.
+  See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
+{
+    return _mm_blendv_ps(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
+{
+    return _mm_blendv_pd(y.v, x.v, m.v);
+}
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
+{
+    return _mm256_blendv_pd(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
+{
+    return _mm256_blendv_ps(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+    // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+    return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+    return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
+}
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
+{
+    return vbslq_f32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
+{
+    return vbslq_s8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
+{
+    return vbslq_u8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
+{
+    return vbslq_s16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
+{
+    return vbslq_u16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
+{
+    return vbslq_s32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
+{
+    return vbslq_u32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
+{
+    return vbslq_s64(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
+{
+    return vbslq_u64(m.v, x.v, y.v);
+}
+
+#ifdef CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+    return vbslq_f64(m.v, x.v, y.v);
+}
+#else
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+    return y ^ ((x ^ y) & m.asvec());
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+
+#else
+
+// fallback
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+{
+    return y ^ ((x ^ y) & m.asvec());
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+#endif
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y)
+{
+    return m ? x : y;
+}
+
+} // namespace intrinsics
+KFR_I_FN(select)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
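The NEON f64 path and the generic fallback both use the same bitwise blend, `y ^ ((x ^ y) & m)`, which relies on each mask lane being all ones or all zeros. A scalar sketch of why this works (illustrative; `select_bits` is not a KFR function):

#include <cstdint>
#include <cstdio>

// With m == all-ones, (x ^ y) & m == x ^ y, so y ^ (x ^ y) == x.
// With m == all-zeros, (x ^ y) & m == 0, so the result is y. No branches.
static uint32_t select_bits(uint32_t m, uint32_t x, uint32_t y)
{
    return y ^ ((x ^ y) & m);
}

int main()
{
    std::printf("%u\n", select_bits(0xFFFFFFFFu, 1u, 2u)); // 1
    std::printf("%u\n", select_bits(0x00000000u, 1u, 2u)); // 2
}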
x : y; +} + +} // namespace intrinsics +KFR_I_FN(select) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/logical.hpp b/include/kfr/simd/logical.hpp diff --git a/include/kfr/math/min_max.hpp b/include/kfr/simd/min_max.hpp diff --git a/include/kfr/math/round.hpp b/include/kfr/simd/round.hpp diff --git a/include/kfr/math/saturation.hpp b/include/kfr/simd/saturation.hpp diff --git a/include/kfr/math/select.hpp b/include/kfr/simd/select.hpp diff --git a/include/kfr/simd/sort.hpp b/include/kfr/simd/sort.hpp @@ -0,0 +1,103 @@ +/** @addtogroup utility + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "min_max.hpp" +#include "shuffle.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Sort the elements in the vector in ascending order + * @param x input vector + * @return sorted vector + * @code + * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000)); + * @endcode + */ +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x) +{ + constexpr size_t Nhalf = N / 2; + vec<T, Nhalf> e = low(x); + vec<T, Nhalf> o = high(x); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + for (size_t i = 0; i < Nhalf; i++) + { + vec<T, Nhalf> t; + t = min(e, o); + o = max(e, o); + o = rotateright<1>(o); + e = t; + t = max(e, o); + o = min(e, o); + e = t; + t = blend(e, o, blend0); + o = blend(o, e, blend0); + o = rotateleft<1>(o); + e = t; + } + return interleavehalves(concat(e, o)); +} + +/** + * @brief Sort the elements in the vector in descending order + * @param x input vector + * @return sorted vector + * @code + * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10)); + * @endcode + */ +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x) +{ + constexpr size_t Nhalf = N / 2; + vec<T, Nhalf> e = low(x); + vec<T, Nhalf> o = high(x); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + for (size_t i = 0; i < Nhalf; i++) + { + vec<T, Nhalf> t; + t = max(e, o); + o = min(e, o); + o = rotateright<1>(o); + e = t; + t = min(e, o); + o = max(e, o); + e = t; + t = blend(e, o, blend0); + o = blend(o, e, blend0); + o = rotateleft<1>(o); + e = t; + } + return interleavehalves(concat(e, o)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/sources.cmake b/sources.cmake @@ -27,13 +27,14 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/math_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp + 
diff --git a/sources.cmake b/sources.cmake
@@ -27,13 +27,14 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/math_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/old_basic_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/random_bits.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/shape.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp
@@ -91,65 +92,66 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/abs.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex_type.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/round.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/saturation.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/sort.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_complex.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/logical.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/round.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/saturation.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
@@ -187,18 +189,18 @@ set(
     ${PROJECT_SOURCE_DIR}/tests/unit/base/tensor.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/graphics/color.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/graphics/geometry.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/abs.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/min_max.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/round.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/select.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp
 )
diff --git a/tests/unit/math/abs.cpp b/tests/unit/simd/abs.cpp
diff --git a/tests/unit/math/min_max.cpp b/tests/unit/simd/min_max.cpp
diff --git a/tests/unit/math/round.cpp b/tests/unit/simd/round.cpp
diff --git a/tests/unit/math/select.cpp b/tests/unit/simd/select.cpp
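Taken together, this commit moves the element-wise vector operations (abs, clamp, logical, min_max, round, saturation, select, sort) from kfr/math/ and kfr/base/ to kfr/simd/. A minimal sketch of downstream usage against the new header locations, assuming the usual public wrappers `satadd`, `select`, and `sort` shown in the files above:

#include <cstdint>
#include <kfr/simd/saturation.hpp> // satadd / satsub (moved from kfr/math/saturation.hpp)
#include <kfr/simd/select.hpp>     // select (moved from kfr/math/select.hpp)
#include <kfr/simd/sort.hpp>       // sort / sortdesc (moved from kfr/base/sort.hpp)

using namespace kfr;

int main()
{
    const vec<int16_t, 8> a{ 32000, -32000, 5, 2, 7, 4, 1, 6 };
    const vec<int16_t, 8> b{ 1000, -1000, 1, 1, 1, 1, 1, 1 };
    const vec<int16_t, 8> clamped = satadd(a, b);        // clamps to 32767 / -32768 instead of wrapping
    const vec<int16_t, 8> mixed   = select(a > b, a, b); // per-lane maximum via mask blend
    const vec<int16_t, 8> sorted  = sort(mixed);         // ascending, in registers
    (void)clamped;
    (void)sorted;
    return 0;
}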