Refactoring: Improve compilation time - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 3945a8f00b5062cc698afda300a07fddcdf3ba3a
parent 6ff1acd1d2216a0655755c48a805cb14c130c150
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Mon, 25 Jul 2016 13:28:15 +0300

Refactoring: Improve compilation time

Diffstat:
M examples/biquads.cpp  | 2 --
M examples/dft.cpp  | 5 ++---
M examples/fir.cpp  | 2 --
M examples/resampling.cpp  | 5 -----
M examples/window.cpp  | 2 --
M include/kfr/base/abs.hpp  | 128 +++++++++++++++++++++++++------------------------------------------------------
M include/kfr/base/asin_acos.hpp  | 66 +++++++++++++++++++++---------------------------------------------
M include/kfr/base/atan.hpp  | 407 +++++++++++++++++++++++++++++++++++++++----------------------------------------
A include/kfr/base/clamp.hpp  | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/base/complex.hpp  | 455 ++++++++++++++++++++++++++++++++++---------------------------------------------
M include/kfr/base/gamma.hpp  | 66 +++++++++++++++++++++++++++---------------------------------------
M include/kfr/base/hyperbolic.hpp  | 178 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M include/kfr/base/log_exp.hpp  | 768 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M include/kfr/base/logical.hpp  | 466 +++++++++++++++++++++++++++++--------------------------------------------------
M include/kfr/base/min_max.hpp  | 436 +++++++++++++++++++++----------------------------------------------------------
A include/kfr/base/modzerobessel.hpp  | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/base/operators.hpp  | 36 ++++++++++++++++++++++++++++++++++++
M include/kfr/base/round.hpp  | 377 ++++++++++++++++++++++++++++++++-----------------------------------------------
M include/kfr/base/saturation.hpp  | 246 +++++++++++++++++++++++++++++++------------------------------------------------
M include/kfr/base/select.hpp  | 205 ++++++++++++++++++++-----------------------------------------------------------
M include/kfr/base/sin_cos.hpp  | 685 +++++++++++++++++++++++++++++++++----------------------------------------------
M include/kfr/base/sqrt.hpp  | 79 ++++++++++++++++++++++---------------------------------------------------------
M include/kfr/base/tan.hpp  | 227 +++++++++++++++++++++++++++++++++++--------------------------------------------
M include/kfr/dft/conv.hpp  | 1 -
M include/kfr/dft/fft.hpp  | 4 ++--
M include/kfr/dft/ft.hpp  | 2 +-
M include/kfr/dsp/fir.hpp  | 115 +++++++++++++++++++++++++++++++++----------------------------------------------
M include/kfr/dsp/fir_design.hpp  | 256 +++++++++++++++++++++++++------------------------------------------------------
M include/kfr/dsp/fracdelay.hpp  | 8 ++------
M include/kfr/dsp/oscillators.hpp  | 300 +++++++++++++++++++++++++++++++++----------------------------------------------
M include/kfr/dsp/resample.hpp  | 303 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M include/kfr/dsp/units.hpp  | 239 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M include/kfr/dsp/weighting.hpp  | 139 +++++++++++++++++++++++++++++++++++++------------------------------------------
M include/kfr/dsp/window.hpp  | 901 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M include/kfr/expressions/reduce.hpp  | 228 +++++++++++++++++++++++++++-----------------------------------------------------
M include/kfr/math.hpp  | 4 ----
M sources.cmake  | 3 ++-
M tests/basic_vector_test.cpp  | 119 +++++++++++++++++++++++++------------------------------------------------------
M tests/complex_test.cpp  | 5 ++++-
M tests/conv_test.cpp  | 2 +-
M tests/dft_test.cpp  | 1 -
M tests/empty_test.cpp  | 2 --
M tests/fracdelay_test.cpp  | 16 ++++++++--------
M tests/stat_test.cpp  | 40 ++++++++++++++++++++--------------------

44 files changed, 3271 insertions(+), 4444 deletions(-)
diff --git a/examples/biquads.cpp b/examples/biquads.cpp
@@ -25,13 +25,11 @@
 #include <kfr/io/python_plot.hpp>
 
 using namespace kfr;
-using namespace kfr::native;
 
 int main(int argc, char** argv)
 {
     println(library_version());
 
-    using namespace native;
     const std::string options = "phaseresp=True";
 
     univector<double, 128> output;
diff --git a/examples/dft.cpp b/examples/dft.cpp
@@ -17,7 +17,6 @@
 #include <kfr/dsp/oscillators.hpp>
 #include <kfr/dsp/units.hpp>
 #include <kfr/expressions/basic.hpp>
-#include <kfr/expressions/operators.hpp>
 #include <kfr/expressions/reduce.hpp>
 #include <kfr/math.hpp>
 #include <kfr/misc/random.hpp>
@@ -52,8 +51,8 @@ int main(int argc, char** argv)
     // get magnitude and convert to decibels
     univector<float_type, size> dB = amp_to_dB(cabs(out));
 
-    println("max  = ", max(dB));
-    println("min  = ", min(dB));
+    println("max  = ", maxof(dB));
+    println("min  = ", minof(dB));
     println("mean = ", mean(dB));
     println("rms  = ", rms(dB));
 
diff --git a/examples/fir.cpp b/examples/fir.cpp
@@ -30,13 +30,11 @@
 #include <iostream>
 
 using namespace kfr;
-using namespace kfr::native;
 
 int main(int argc, char** argv)
 {
     println(library_version());
 
-    using namespace native;
     const std::string options = "phaseresp=False";
 
     univector<double, 15> taps15;
diff --git a/examples/resampling.cpp b/examples/resampling.cpp
@@ -21,16 +21,12 @@
 // swept
 #include <kfr/dsp/oscillators.hpp>
 
-// operator overloading for expressions
-#include <kfr/expressions/operators.hpp>
-
 // plot_save()
 #include <kfr/io/python_plot.hpp>
 
 #include <iostream>
 
 using namespace kfr;
-using namespace kfr::native;
 
 constexpr size_t input_sr  = 96000;
 constexpr size_t output_sr = 44100;
@@ -41,7 +37,6 @@ int main(int argc, char** argv)
 {
     println(library_version());
 
-    using namespace native;
     const std::string options = "phaseresp=False";
 
     univector<f64> swept_sine = swept(0.5, len);
diff --git a/examples/window.cpp b/examples/window.cpp
@@ -22,13 +22,11 @@
 #include <kfr/io/python_plot.hpp>
 
 using namespace kfr;
-using namespace kfr::native;
 
 int main(int argc, char** argv)
 {
     println(library_version());
 
-    using namespace native;
     const std::string options = "freqresp=True, dots=True, padwidth=1024, "
                                 "log_freq=False, horizontal=False, normalized_freq=True";
 
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -26,114 +26,68 @@
 #include "operators.hpp"
 #include "select.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
-
-template <cpu_t cpu = cpu_t::native, cpu_t cc = cpu>
-struct in_abs : in_abs<older(cpu), cc>
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> abs(vec<T, N> x)
 {
-    struct fn_abs : in_abs<older(cpu), cc>::fn_abs, fn_disabled
-    {
-    };
-};
-
-template <cpu_t cc>
-struct in_abs<cpu_t::common, cc> : in_select<cc>
-{
-    constexpr static cpu_t cpu = cpu_t::common;
-
-private:
-    using in_select<cc>::select;
-
-public:
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> abs(vec<T, N> value)
-    {
-        return select(value >= T(), value, -value);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> abs(vec<T, N> value)
-    {
-        return value & invhighbitmask<T>;
-    }
-
-    KFR_HANDLE_SCALAR(abs)
-    KFR_SPEC_FN(in_abs, abs)
-};
+    return x & internal::invhighbitmask<T>;
+}
 
-#ifdef CID_ARCH_X86
+#if defined CID_ARCH_SSSE3
 
-template <cpu_t cc>
-struct in_abs<cpu_t::ssse3, cc> : in_select<cc>
+template <size_t N> // i64 lanes: select-based abs (the intrinsic overloads below cover only i32/i16/i8)
+KFR_SINTRIN vec<i64, N> abs(vec<i64, N> x)
 {
-    constexpr static cpu_t cpu = cpu_t::ssse3;
-
-private:
-    using in_select<cc>::select;
-
-public:
-    template <size_t N>
-    KFR_SINTRIN vec<i64, N> abs(vec<i64, N> value)
-    {
-        return select(value >= 0, value, -value);
-    }
-
-    KFR_CPU_INTRIN(ssse3) i32sse abs(i32sse value) { return _mm_abs_epi32(*value); }
-    KFR_CPU_INTRIN(ssse3) i16sse abs(i16sse value) { return _mm_abs_epi16(*value); }
-    KFR_CPU_INTRIN(ssse3) i8sse abs(i8sse value) { return _mm_abs_epi8(*value); }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> abs(vec<T, N> value)
-    {
-        return value & invhighbitmask<T>;
-    }
-
-    KFR_HANDLE_ALL(abs)
-    KFR_HANDLE_SCALAR(abs)
-    KFR_SPEC_FN(in_abs, abs)
-};
+    return select(x >= 0, x, -x); // no template parameter T here: it was never deducible, so compare against 0
+}
+KFR_SINTRIN i32sse abs(i32sse value) { return _mm_abs_epi32(*value); }
+KFR_SINTRIN i16sse abs(i16sse value) { return _mm_abs_epi16(*value); }
+KFR_SINTRIN i8sse abs(i8sse value) { return _mm_abs_epi8(*value); }
+
+#if defined CID_ARCH_AVX2
+KFR_SINTRIN i32avx abs(i32avx value) { return _mm256_abs_epi32(*value); }
+KFR_SINTRIN i16avx abs(i16avx value) { return _mm256_abs_epi16(*value); }
+KFR_SINTRIN i8avx abs(i8avx value) { return _mm256_abs_epi8(*value); }
+#endif
 
-template <cpu_t cc>
-struct in_abs<cpu_t::avx2, cc> : in_abs<cpu_t::ssse3, cc>
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && !is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> abs(vec<T, N> a)
 {
-    constexpr static cpu_t cpu = cpu_t::avx2;
-    using in_abs<cpu_t::ssse3, cc>::abs;
+    return slice<0, N>(abs(expand_simd(a)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && !is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> abs(vec<T, N> a)
+{
+    return concat(abs(low(a)), abs(high(a)));
+}
 
-    KFR_CPU_INTRIN(avx2) i32avx abs(i32avx value) { return _mm256_abs_epi32(*value); }
-    KFR_CPU_INTRIN(avx2) i16avx abs(i16avx value) { return _mm256_abs_epi16(*value); }
-    KFR_CPU_INTRIN(avx2) i8avx abs(i8avx value) { return _mm256_abs_epi8(*value); }
+#else
 
-    KFR_HANDLE_ALL(abs)
-    KFR_HANDLE_SCALAR(abs)
-    KFR_SPEC_FN(in_abs, abs)
-};
+// fallback: non-floating-point abs via compare-and-select, used when CID_ARCH_SSSE3 is not defined
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> abs(vec<T, N> x)
+{
+    return select(x >= T(), x, -x); // parameter is named x; the former `value` was undeclared here
+}
 #endif
+KFR_HANDLE_SCALAR_1(abs)
+KFR_FN(abs)
 }
 
-namespace native
-{
-using fn_abs = internal::in_abs<>::fn_abs;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> abs(const T1& x)
+KFR_INTRIN T1 abs(const T1& x)
 {
-    return internal::in_abs<>::abs(x);
+    return internal::abs(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-
-KFR_INTRIN expr_func<fn_abs, E1> abs(E1&& x)
+KFR_INTRIN expr_func<internal::fn_abs, E1> abs(E1&& x)
 {
-    return { fn_abs(), std::forward<E1>(x) };
-}
+    return { {}, std::forward<E1>(x) };
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp
@@ -22,79 +22,55 @@
  */
 #pragma once
 
-#include "abs.hpp"
 #include "atan.hpp"
-#include "constants.hpp"
 #include "function.hpp"
-#include "min_max.hpp"
-#include "operators.hpp"
 #include "select.hpp"
-#include "shuffle.hpp"
 #include "sqrt.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
 
-template <cpu_t cpu = cpu_t::native>
-struct in_asin_acos : private in_select<cpu>, private in_atan<cpu>, private in_sqrt<cpu>
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> asin(vec<T, N> x)
 {
-private:
-    using in_atan<cpu>::atan2;
-    using in_sqrt<cpu>::sqrt;
-
-public:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> asin(vec<T, N> x)
-    {
-        return atan2(x, sqrt(T(1) - x * x));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> acos(vec<T, N> x)
-    {
-        return atan2(sqrt(T(1) - x * x), x);
-    }
-    KFR_SPEC_FN(in_asin_acos, asin)
-    KFR_SPEC_FN(in_asin_acos, acos)
-};
+    return atan2(x, sqrt(T(1) - x * x));
 }
 
-namespace native
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> acos(vec<T, N> x)
 {
-using fn_asin = internal::in_asin_acos<>::fn_asin;
+    return atan2(sqrt(T(1) - x * x), x);
+}
+KFR_HANDLE_SCALAR(asin)
+KFR_HANDLE_SCALAR(acos)
+KFR_FN(asin)
+KFR_FN(acos)
+}
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> asin(const T1& x)
+KFR_INTRIN T1 asin(const T1& x)
 {
-    return internal::in_asin_acos<>::asin(x);
+    return internal::asin(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_asin, E1> asin(E1&& x)
+KFR_INTRIN expr_func<internal::fn_asin, E1> asin(E1&& x)
 {
-    return { fn_asin(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_acos = internal::in_asin_acos<>::fn_acos;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> acos(const T1& x)
+KFR_INTRIN T1 acos(const T1& x)
 {
-    return internal::in_asin_acos<>::acos(x);
+    return internal::acos(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_acos, E1> acos(E1&& x)
+KFR_INTRIN expr_func<internal::fn_acos, E1> acos(E1&& x)
 {
-    return { fn_acos(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 }
-}
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -28,240 +28,231 @@
 #include "select.hpp"
 #include "sin_cos.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 namespace internal
 {
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_atan : in_trig<cc>, in_select<cc>, in_round<cc>, in_abs<cc>
+template <size_t N>
+KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x)
 {
-private:
-    using in_abs<cc>::abs;
-    using in_round<cc>::floor;
-    using in_select<cc>::select;
-    using in_trig<cc>::mask_horner;
-    using in_select<cc>::sign;
+    vec<f32, N> s, t, u;
+    vec<i32, N> q;
+    q = select(x < 0, -2, 0);
+    x = select(x < 0, -x, x);
+    mask<i32, N> m;
+    m = y > x;
+    t = x;
+    x = select(m, y, x);
+    y = select(m, -t, y);
+    q = select(m, q + 1, q);
+    s = y / x;
+    t = s * s;
+    u = 0.00282363896258175373077393f;
+    u = fmadd(u, t, -0.0159569028764963150024414f);
+    u = fmadd(u, t, 0.0425049886107444763183594f);
+    u = fmadd(u, t, -0.0748900920152664184570312f);
+    u = fmadd(u, t, 0.106347933411598205566406f);
+    u = fmadd(u, t, -0.142027363181114196777344f);
+    u = fmadd(u, t, 0.199926957488059997558594f);
+    u = fmadd(u, t, -0.333331018686294555664062f);
+    t = u * t * s + s;
+    t = cast<f32>(q) * 1.5707963267948966192313216916398f + t;
+    return t;
+}
 
-public:
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x)
-    {
-        vec<f32, N> s, t, u;
-        vec<i32, N> q;
-        q = select(x < 0, -2, 0);
-        x = select(x < 0, -x, x);
-        mask<i32, N> m;
-        m = y > x;
-        t = x;
-        x = select(m, y, x);
-        y = select(m, -t, y);
-        q = select(m, q + 1, q);
-        s = y / x;
-        t = s * s;
-        u = 0.00282363896258175373077393f;
-        u = fmadd(u, t, -0.0159569028764963150024414f);
-        u = fmadd(u, t, 0.0425049886107444763183594f);
-        u = fmadd(u, t, -0.0748900920152664184570312f);
-        u = fmadd(u, t, 0.106347933411598205566406f);
-        u = fmadd(u, t, -0.142027363181114196777344f);
-        u = fmadd(u, t, 0.199926957488059997558594f);
-        u = fmadd(u, t, -0.333331018686294555664062f);
-        t = u * t * s + s;
-        t = cast<f32>(q) * 1.5707963267948966192313216916398f + t;
-        return t;
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x)
-    {
-        vec<f64, N> s, t, u;
-        vec<i64, N> q;
-        q = select(x < 0, -2ll, 0ll);
-        x = select(x < 0, -x, x);
-        vec<i64, N> m;
-        m = y > x;
-        t = x;
-        x = select(m, y, x);
-        y = select(m, -t, y);
-        q = select(m, q + 1ll, q);
-        s = y / x;
-        t = s * s;
-        u = -1.88796008463073496563746e-05;
-        u = fmadd(u, t, 0.000209850076645816976906797);
-        u = fmadd(u, t, -0.00110611831486672482563471);
-        u = fmadd(u, t, 0.00370026744188713119232403);
-        u = fmadd(u, t, -0.00889896195887655491740809);
-        u = fmadd(u, t, 0.016599329773529201970117);
-        u = fmadd(u, t, -0.0254517624932312641616861);
-        u = fmadd(u, t, 0.0337852580001353069993897);
-        u = fmadd(u, t, -0.0407629191276836500001934);
-        u = fmadd(u, t, 0.0466667150077840625632675);
-        u = fmadd(u, t, -0.0523674852303482457616113);
-        u = fmadd(u, t, 0.0587666392926673580854313);
-        u = fmadd(u, t, -0.0666573579361080525984562);
-        u = fmadd(u, t, 0.0769219538311769618355029);
-        u = fmadd(u, t, -0.090908995008245008229153);
-        u = fmadd(u, t, 0.111111105648261418443745);
-        u = fmadd(u, t, -0.14285714266771329383765);
-        u = fmadd(u, t, 0.199999999996591265594148);
-        u = fmadd(u, t, -0.333333333333311110369124);
-        t = u * t * s + s;
-        t = cast<f64>(q) * 1.5707963267948966192313216916398 + t;
-        return t;
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x)
-    {
-        vec<f32, N> r = atan2k(abs(y), x);
-        constexpr f32 pi        = 3.1415926535897932384626433832795f;
-        constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f;
-        constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f;
-        r                       = mulsign(r, x);
-        r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
-        r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
-        r = select(y == 0.0f, fbitcast(ibitcast(sign(x) == -1.0f) & ibitcast(pi)), r);
-        r = fbitcast(ibitcast(isnan(x) || isnan(y)) | ibitcast(mulsign(r, y)));
-        return r;
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x)
-    {
-        vec<f64, N> r = atan2k(abs(y), x);
-        constexpr f64 pi        = 3.1415926535897932384626433832795;
-        constexpr f64 pi_over_2 = 1.5707963267948966192313216916398;
-        constexpr f64 pi_over_4 = 0.78539816339744830961566084581988;
-        r                       = mulsign(r, x);
-        r = select(isinf(x) || x == 0.0, pi_over_2 - select(x, mulsign(pi_over_2, x), 0.0), r);
-        r = select(isinf(y), pi_over_2 - select(x, mulsign(pi_over_4, x), 0.0), r);
-        r = select(y == 0.0, fbitcast(ibitcast(sign(x) == -1.0) & ibitcast(pi)), r);
-        r = fbitcast(ibitcast(isnan(x) || isnan(y)) | ibitcast(mulsign(r, y)));
-        return r;
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s)
-    {
-        vec<f32, N> t, u;
-        vec<i32, N> q;
-        q = select(s < 0.f, 2, 0);
-        s = select(s < 0.f, -s, s);
-        q = select(s > 1.f, q | 1, q);
-        s = select(s > 1.f, 1.0f / s, s);
-        t = s * s;
-        u = 0.00282363896258175373077393f;
-        u = fmadd(u, t, -0.0159569028764963150024414f);
-        u = fmadd(u, t, 0.0425049886107444763183594f);
-        u = fmadd(u, t, -0.0748900920152664184570312f);
-        u = fmadd(u, t, 0.106347933411598205566406f);
-        u = fmadd(u, t, -0.142027363181114196777344f);
-        u = fmadd(u, t, 0.199926957488059997558594f);
-        u = fmadd(u, t, -0.333331018686294555664062f);
-        t = s + s * (t * u);
-        t = select((q & 1) != 0, 1.570796326794896557998982f - t, t);
-        t = select((q & 2) != 0, -t, t);
-        return t;
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> atan(vec<f64, N> s)
-    {
-        vec<f64, N> t, u;
-        vec<i64, N> q;
-        q = select(s < 0.0, 2ll, 0ll);
-        s = select(s < 0.0, -s, s);
-        q = select(s > 1.0, q | 1, q);
-        s = select(s > 1.0, 1.0 / s, s);
-        t = s * s;
-        u = -1.88796008463073496563746e-05;
-        u = fmadd(u, t, 0.000209850076645816976906797);
-        u = fmadd(u, t, -0.00110611831486672482563471);
-        u = fmadd(u, t, 0.00370026744188713119232403);
-        u = fmadd(u, t, -0.00889896195887655491740809);
-        u = fmadd(u, t, 0.016599329773529201970117);
-        u = fmadd(u, t, -0.0254517624932312641616861);
-        u = fmadd(u, t, 0.0337852580001353069993897);
-        u = fmadd(u, t, -0.0407629191276836500001934);
-        u = fmadd(u, t, 0.0466667150077840625632675);
-        u = fmadd(u, t, -0.0523674852303482457616113);
-        u = fmadd(u, t, 0.0587666392926673580854313);
-        u = fmadd(u, t, -0.0666573579361080525984562);
-        u = fmadd(u, t, 0.0769219538311769618355029);
-        u = fmadd(u, t, -0.090908995008245008229153);
-        u = fmadd(u, t, 0.111111105648261418443745);
-        u = fmadd(u, t, -0.14285714266771329383765);
-        u = fmadd(u, t, 0.199999999996591265594148);
-        u = fmadd(u, t, -0.333333333333311110369124);
-        t = s + s * (t * u);
-        t = select((q & 1) != 0, 1.570796326794896557998982 - t, t);
-        t = select((q & 2) != 0, -t, t);
-        return t;
-    }
-    template <typename T>
-    KFR_SINTRIN T atandeg(const T& x)
-    {
-        return atan(x) * c_radtodeg<T>;
-    }
-    template <typename T1, typename T2>
-    KFR_SINTRIN common_type<T1, T2> atan2deg(const T1& y, const T2& x)
-    {
-        return atan2(y, x) * c_radtodeg<common_type<T1, T2>>;
-    }
-    KFR_HANDLE_SCALAR(atan)
-    KFR_HANDLE_SCALAR(atan2)
-    KFR_SPEC_FN(in_atan, atan)
-    KFR_SPEC_FN(in_atan, atandeg)
-    KFR_SPEC_FN(in_atan, atan2)
-    KFR_SPEC_FN(in_atan, atan2deg)
-};
+template <size_t N>
+KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x)
+{
+    vec<f64, N> s, t, u;
+    vec<i64, N> q;
+    q = select(x < 0, -2ll, 0ll);
+    x = select(x < 0, -x, x);
+    vec<i64, N> m;
+    m = y > x;
+    t = x;
+    x = select(m, y, x);
+    y = select(m, -t, y);
+    q = select(m, q + 1ll, q);
+    s = y / x;
+    t = s * s;
+    u = -1.88796008463073496563746e-05;
+    u = fmadd(u, t, 0.000209850076645816976906797);
+    u = fmadd(u, t, -0.00110611831486672482563471);
+    u = fmadd(u, t, 0.00370026744188713119232403);
+    u = fmadd(u, t, -0.00889896195887655491740809);
+    u = fmadd(u, t, 0.016599329773529201970117);
+    u = fmadd(u, t, -0.0254517624932312641616861);
+    u = fmadd(u, t, 0.0337852580001353069993897);
+    u = fmadd(u, t, -0.0407629191276836500001934);
+    u = fmadd(u, t, 0.0466667150077840625632675);
+    u = fmadd(u, t, -0.0523674852303482457616113);
+    u = fmadd(u, t, 0.0587666392926673580854313);
+    u = fmadd(u, t, -0.0666573579361080525984562);
+    u = fmadd(u, t, 0.0769219538311769618355029);
+    u = fmadd(u, t, -0.090908995008245008229153);
+    u = fmadd(u, t, 0.111111105648261418443745);
+    u = fmadd(u, t, -0.14285714266771329383765);
+    u = fmadd(u, t, 0.199999999996591265594148);
+    u = fmadd(u, t, -0.333333333333311110369124);
+    t = u * t * s + s;
+    t = cast<f64>(q) * 1.5707963267948966192313216916398 + t;
+    return t;
 }
-namespace native
+
+template <size_t N>
+KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x)
 {
-using fn_atan = internal::in_atan<>::fn_atan;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> atan(const T1& y, const T2& x)
+    vec<f32, N> r = atan2k(abs(y), x);
+    constexpr f32 pi        = 3.1415926535897932384626433832795f;
+    constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f;
+    constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f;
+    r                       = mulsign(r, x);
+    r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
+    r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
+    r = select(y == 0.0f, fbitcast(ibitcast(x <  0) & ibitcast(pi)), r);
+    r = fbitcast(ibitcast(isnan(x) || isnan(y)) | ibitcast(mulsign(r, y)));
+    return r;
+}
+
+template <size_t N>
+KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x)
 {
-    return internal::in_atan<>::atan(y, x);
+    vec<f64, N> r = atan2k(abs(y), x);
+    constexpr f64 pi        = 3.1415926535897932384626433832795;
+    constexpr f64 pi_over_2 = 1.5707963267948966192313216916398;
+    constexpr f64 pi_over_4 = 0.78539816339744830961566084581988;
+    r                       = mulsign(r, x);
+    r = select(isinf(x) || x == 0.0, pi_over_2 - select(x, mulsign(pi_over_2, x), 0.0), r);
+    r = select(isinf(y), pi_over_2 - select(x, mulsign(pi_over_4, x), 0.0), r);
+    r = select(y == 0.0, fbitcast(ibitcast(x < 0) & ibitcast(pi)), r);
+    r = fbitcast(ibitcast(isnan(x) || isnan(y)) | ibitcast(mulsign(r, y)));
+    return r;
 }
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_atan, E1, E2> atan(E1&& y, E2&& x)
+
+template <size_t N>
+KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s)
 {
-    return { fn_atan(), std::forward<E1>(y), std::forward<E2>(x) };
+    vec<f32, N> t, u;
+    vec<i32, N> q;
+    q = select(s < 0.f, 2, 0);
+    s = select(s < 0.f, -s, s);
+    q = select(s > 1.f, q | 1, q);
+    s = select(s > 1.f, 1.0f / s, s);
+    t = s * s;
+    u = 0.00282363896258175373077393f;
+    u = fmadd(u, t, -0.0159569028764963150024414f);
+    u = fmadd(u, t, 0.0425049886107444763183594f);
+    u = fmadd(u, t, -0.0748900920152664184570312f);
+    u = fmadd(u, t, 0.106347933411598205566406f);
+    u = fmadd(u, t, -0.142027363181114196777344f);
+    u = fmadd(u, t, 0.199926957488059997558594f);
+    u = fmadd(u, t, -0.333331018686294555664062f);
+    t = s + s * (t * u);
+    t = select((q & 1) != 0, 1.570796326794896557998982f - t, t);
+    t = select((q & 2) != 0, -t, t);
+    return t;
 }
-using fn_atan2 = internal::in_atan<>::fn_atan2;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> atan2(const T1& y, const T2& x)
+
+template <size_t N>
+KFR_SINTRIN vec<f64, N> atan(vec<f64, N> s)
 {
-    return internal::in_atan<>::atan2(y, x);
+    vec<f64, N> t, u;
+    vec<i64, N> q;
+    q = select(s < 0.0, 2ll, 0ll);
+    s = select(s < 0.0, -s, s);
+    q = select(s > 1.0, q | 1, q);
+    s = select(s > 1.0, 1.0 / s, s);
+    t = s * s;
+    u = -1.88796008463073496563746e-05;
+    u = fmadd(u, t, 0.000209850076645816976906797);
+    u = fmadd(u, t, -0.00110611831486672482563471);
+    u = fmadd(u, t, 0.00370026744188713119232403);
+    u = fmadd(u, t, -0.00889896195887655491740809);
+    u = fmadd(u, t, 0.016599329773529201970117);
+    u = fmadd(u, t, -0.0254517624932312641616861);
+    u = fmadd(u, t, 0.0337852580001353069993897);
+    u = fmadd(u, t, -0.0407629191276836500001934);
+    u = fmadd(u, t, 0.0466667150077840625632675);
+    u = fmadd(u, t, -0.0523674852303482457616113);
+    u = fmadd(u, t, 0.0587666392926673580854313);
+    u = fmadd(u, t, -0.0666573579361080525984562);
+    u = fmadd(u, t, 0.0769219538311769618355029);
+    u = fmadd(u, t, -0.090908995008245008229153);
+    u = fmadd(u, t, 0.111111105648261418443745);
+    u = fmadd(u, t, -0.14285714266771329383765);
+    u = fmadd(u, t, 0.199999999996591265594148);
+    u = fmadd(u, t, -0.333333333333311110369124);
+    t = s + s * (t * u);
+    t = select((q & 1) != 0, 1.570796326794896557998982 - t, t);
+    t = select((q & 2) != 0, -t, t);
+    return t;
 }
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_atan2, E1, E2> atan2(E1&& y, E2&& x)
+
+template <typename T>
+KFR_SINTRIN T atandeg(const T& x)
 {
-    return { fn_atan2(), std::forward<E1>(y), std::forward<E2>(x) };
+    return atan(x) * c_radtodeg<T>;
+}
+
+template <typename T1, typename T2>
+KFR_SINTRIN common_type<T1, T2> atan2deg(const T1& y, const T2& x)
+{
+    return atan2(y, x) * c_radtodeg<common_type<T1, T2>>;
+}
+
+KFR_HANDLE_SCALAR(atan)
+KFR_HANDLE_SCALAR(atan2)
+KFR_FN(atan)
+KFR_FN(atandeg)
+KFR_FN(atan2)
+KFR_FN(atan2deg)
 }
-using fn_atandeg = internal::in_atan<>::fn_atandeg;
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRIN ftype<T1> atan(const T1& x)
+{
+    return internal::atan(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRIN expr_func<internal::fn_atan, E1> atan(E1&& x)
+{
+    return { {}, std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRIN ftype<T1> atandeg(const T1& x)
+{
+    return internal::atandeg(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRIN expr_func<internal::fn_atandeg, E1> atandeg(E1&& x)
+{
+    return { {}, std::forward<E1>(x) };
+}
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> atandeg(const T1& y, const T2& x)
+KFR_INTRIN common_type<T1, T2> atan2(const T1& y, const T2& x)
 {
-    return internal::in_atan<>::atandeg(y, x);
+    return internal::atan2(y, x); // argument order is (y, x), matching internal::atan2
 }
+
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_atandeg, E1, E2> atandeg(E1&& y, E2&& x)
+KFR_INTRIN expr_func<internal::fn_atan2, E1, E2> atan2(E1&& y, E2&& x)
 {
-    return { fn_atandeg(), std::forward<E1>(y), std::forward<E2>(x) };
+    return { {}, std::forward<E1>(y), std::forward<E2>(x) }; // operands forwarded in (y, x) order
 }
-using fn_atan2deg = internal::in_atan<>::fn_atan2deg;
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> atan2deg(const T1& y, const T2& x)
+KFR_INTRIN common_type<T1, T2> atan2deg(const T1& y, const T2& x)
 {
-    return internal::in_atan<>::atan2deg(y, x);
+    return internal::atan2deg(y, x); // argument order is (y, x), matching internal::atan2deg
 }
+
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_atan2deg, E1, E2> atan2deg(E1&& y, E2&& x)
+KFR_INTRIN expr_func<internal::fn_atan2deg, E1, E2> atan2deg(E1&& y, E2&& x)
 {
-    return { fn_atan2deg(), std::forward<E1>(y), std::forward<E2>(x) };
-}
+    return { {}, std::forward<E1>(y), std::forward<E2>(x) }; // operands forwarded in (y, x) order
 }
 }
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
+ * This file is part of KFR
+ *
+ * KFR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KFR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with KFR.
+ *
+ * If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ * Buying a commercial license is mandatory as soon as you develop commercial activities without
+ * disclosing the source code of your own applications.
+ * See http://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "min_max.hpp"
+
+namespace kfr
+{
+
+namespace internal
+{
+
+template <typename T, size_t N> // Vector clamp with explicit lower and upper bounds.
+KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> lo, vec<T, N> hi) // x: values to limit; lo/hi: bound vectors of the same type and width
+{
+    return max(min(x, hi), lo); // upper bound is applied first, then the lower bound, so lo takes precedence if lo > hi (assuming min/max from min_max.hpp are element-wise)
+}
+
+template <typename T, size_t N> // Vector clamp with an upper bound only; the lower bound is a zero vector.
+KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> hi) // x: values to limit; hi: upper-bound vector
+{
+    return max(min(x, hi), zerovector<T, N>()); // same min-then-max order as the three-argument overload, with zerovector<T, N>() as the lower bound
+}
+
+KFR_I_FN(clamp)
+}
+
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), // Numeric three-argument clamp: participates only when all three types satisfy is_numeric_args.
+          typename Tout = common_type<T1, T2, T3>> // Tout defaults to the common type of the three arguments
+KFR_INTRIN Tout clamp(const T1& x, const T2& lo, const T3& hi) // Returns x limited by lo and hi, computed in Tout
+{
+    return internal::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi)); // all operands are converted to Tout first. NOTE(review): internal::clamp is declared only for vec<T, N> and this header shows no KFR_HANDLE_SCALAR(clamp) - confirm scalar arguments resolve
+}
+
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> // Expression three-argument clamp: participates only when the operands satisfy is_input_expressions.
+KFR_INTRIN expr_func<internal::fn_clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi) // Returns an expr_func node holding internal::fn_clamp and the three operands; this function only constructs the node.
+{
+    return { {}, std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) }; // {} value-initializes the functor slot; operands are forwarded in (x, lo, hi) order
+}
+
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), // Numeric two-argument clamp: participates only when both types satisfy is_numeric_args.
+          typename Tout = common_type<T1, T2>> // Tout defaults to the common type of the two arguments
+KFR_INTRIN Tout clamp(const T1& x, const T2& hi) // Returns x limited by hi via the two-argument internal::clamp (whose lower bound is a zero vector)
+{
+    return internal::clamp(static_cast<Tout>(x), static_cast<Tout>(hi)); // both operands are converted to Tout first. NOTE(review): as with the three-argument form, confirm scalar arguments resolve against the vec-only internal overload
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> // Expression two-argument clamp: participates only when the operands satisfy is_input_expressions.
+KFR_INTRIN expr_func<internal::fn_clamp, E1, E2> clamp(E1&& x, E2&& hi) // Returns an expr_func node holding internal::fn_clamp and the two operands; this function only constructs the node.
+{
+    return { {}, std::forward<E1>(x), std::forward<E2>(hi) }; // {} value-initializes the functor slot; operands are forwarded in (x, hi) order
+}
+}
diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp
@@ -79,9 +79,31 @@ struct complex
     T re;
     T im;
 };
-
 #endif
 #endif
+}
+namespace cometa
+{
+template <typename T> // Specialization of cometa::compound_type_traits describing kfr::complex<T> as a two-component compound type.
+struct compound_type_traits<kfr::complex<T>>
+{
+    constexpr static size_t width   = 2; // two components: real and imaginary
+    using subtype                   = T; // type of each component
+    using deep_subtype              = cometa::deep_subtype<T>; // defers to cometa::deep_subtype for the component type
+    constexpr static bool is_scalar = false; // complex<T> is reported as non-scalar
+    template <typename U>
+    using rebind = kfr::complex<U>; // same compound shape with component type U
+    template <typename U>
+    using deep_rebind = kfr::complex<cometa::deep_rebind<subtype, U>>; // rebinds the component type through cometa::deep_rebind, then wraps it in complex
+
+    static constexpr subtype at(const kfr::complex<T>& value, size_t index) // component accessor
+    {
+        return index == 0 ? value.real() : value.imag(); // index 0 yields the real part; every other index yields the imaginary part (no range check)
+    }
+};
+}
+namespace kfr
+{
 
 using c32   = complex<f32>;
 using c64   = complex<f64>;
@@ -262,363 +284,280 @@ constexpr KFR_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0))
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native>
-struct in_complex : in_select<c>, in_sin_cos<c>, in_hyperbolic<c>, in_sqrt<c>, in_atan<c>, in_log_exp<c>
-{
-    constexpr static cpu_t cur = c;
-    using in_sqrt<c>::sqrt;
-    using in_sin_cos<c>::sincos;
-    using in_sin_cos<c>::cossin;
-    using in_hyperbolic<c>::sinhcosh;
-    using in_hyperbolic<c>::coshsinh;
-    using in_atan<c>::atan2;
-    using in_log_exp<c>::log;
-    using in_log_exp<c>::log2;
-    using in_log_exp<c>::log10;
-    using in_log_exp<c>::exp;
-    using in_log_exp<c>::exp2;
-    using in_log_exp<c>::exp10;
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> csin(const vec<complex<T>, N>& x)
-    {
-        return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> csinh(const vec<complex<T>, N>& x)
-    {
-        return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> ccos(const vec<complex<T>, N>& x)
-    {
-        return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x)
-    {
-        return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> cabs(const vec<complex<T>, N>& x)
-    {
-        const vec<T, N* 2> xx = sqr(cdecom(x));
-        return sqrt(even(xx) + odd(xx));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> carg(const vec<complex<T>, N>& x)
-    {
-        const vec<T, N* 2> xx = cdecom(x);
-        return atan2(even(xx), odd(xx));
-    }
+template <typename T, size_t N> // Complex sine over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> csin(const vec<complex<T>, N>& x)
+{
+    return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))); // presumably sin(a+bi) = sin(a)cosh(b) + i*cos(a)sinh(b), with sincos/coshsinh filling alternating lanes - confirm against those helpers
+}
+template <typename T, size_t N> // Complex hyperbolic sine over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> csinh(const vec<complex<T>, N>& x)
+{
+    return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); // presumably sinh(a+bi) = sinh(a)cos(b) + i*cosh(a)sin(b), with sinhcosh/cossin filling alternating lanes - confirm against those helpers
+}
+template <typename T, size_t N> // Complex cosine over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> ccos(const vec<complex<T>, N>& x)
+{
+    return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))))); // presumably cos(a+bi) = cos(a)cosh(b) - i*sin(a)sinh(b); negodd presumably negates the odd (imaginary) lanes - confirm
+}
+template <typename T, size_t N> // Complex hyperbolic cosine over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x)
+{
+    return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); // presumably cosh(a+bi) = cosh(a)cos(b) + i*sinh(a)sin(b), with coshsinh/cossin filling alternating lanes - confirm against those helpers
+}
 
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> clog(const vec<complex<T>, N>& x)
-    {
-        return make_complex(log(cabs(x)), carg(x));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> clog2(const vec<complex<T>, N>& x)
-    {
-        return clog(x) * c_recip_log_2<T>;
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> clog10(const vec<complex<T>, N>& x)
-    {
-        return clog(x) * c_recip_log_10<T>;
-    }
+template <typename T, size_t N> // Magnitude of each complex element; the result is a vector of N values of T.
+KFR_SINTRIN vec<T, N> cabs(const vec<complex<T>, N>& x)
+{
+    const vec<T, N* 2> xx = sqr(cdecom(x)); // squares all 2*N scalars obtained from cdecom (presumably interleaved re, im - confirm)
+    return sqrt(even(xx) + odd(xx)); // sums the even and odd lanes pairwise and takes the square root
+}
+template <typename T, size_t N> // Argument (phase angle) of each complex element; the result is a vector of N values of T.
+KFR_SINTRIN vec<T, N> carg(const vec<complex<T>, N>& x)
+{
+    const vec<T, N* 2> xx = cdecom(x); // 2*N scalars from cdecom (presumably interleaved re, im - confirm)
+    return atan2(even(xx), odd(xx)); // NOTE(review): passes the even lanes first and the odd lanes second; if even = real and odd = imaginary, confirm internal atan2 takes (x, y) rather than the conventional (y, x)
+}
 
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> cexp(const vec<complex<T>, N>& x)
-    {
-        return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x)
-    {
-        return cexp(x * c_log_2<T>);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x)
-    {
-        return cexp(x * c_log_10<T>);
-    }
+template <typename T, size_t N> // Complex natural logarithm over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> clog(const vec<complex<T>, N>& x)
+{
+    return make_complex(log(cabs(x)), carg(x)); // real part = log of the magnitude, imaginary part = the argument
+}
+template <typename T, size_t N> // Complex base-2 logarithm, derived from clog.
+KFR_SINTRIN vec<complex<T>, N> clog2(const vec<complex<T>, N>& x)
+{
+    return clog(x) * c_recip_log_2<T>; // scales the natural log by c_recip_log_2<T> (presumably 1/ln 2 - confirm in constants.hpp)
+}
+template <typename T, size_t N> // Complex base-10 logarithm, derived from clog.
+KFR_SINTRIN vec<complex<T>, N> clog10(const vec<complex<T>, N>& x)
+{
+    return clog(x) * c_recip_log_10<T>; // scales the natural log by c_recip_log_10<T> (presumably 1/ln 10 - confirm in constants.hpp)
+}
 
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> polar(const vec<complex<T>, N>& x)
-    {
-        return make_complex(cabs(x), carg(x));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x)
-    {
-        return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x))));
-    }
+template <typename T, size_t N> // Complex exponential over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> cexp(const vec<complex<T>, N>& x)
+{
+    return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); // presumably exp(a+bi) = exp(a)*(cos(b) + i*sin(b)), with cossin filling alternating (cos, sin) lanes - confirm against those helpers
+}
+template <typename T, size_t N> // Complex base-2 exponential, derived from cexp.
+KFR_SINTRIN vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x)
+{
+    return cexp(x * c_log_2<T>); // scales the argument by c_log_2<T> (presumably ln 2 - confirm in constants.hpp) before calling cexp
+}
+template <typename T, size_t N> // Complex base-10 exponential, derived from cexp.
+KFR_SINTRIN vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x)
+{
+    return cexp(x * c_log_10<T>); // scales the argument by c_log_10<T> (presumably ln 10 - confirm in constants.hpp) before calling cexp
+}
 
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> cabsdup(vec<T, N> x)
-    {
-        x = sqr(x);
-        return sqrt(x + swap<2>(x));
-    }
+template <typename T, size_t N> // Converts each complex element to polar form, packed back into a complex value.
+KFR_SINTRIN vec<complex<T>, N> polar(const vec<complex<T>, N>& x)
+{
+    return make_complex(cabs(x), carg(x)); // magnitude goes in the real slot, argument in the imaginary slot
+}
+template <typename T, size_t N> // Counterpart of polar: presumably treats the real slot as magnitude and the imaginary slot as angle - confirm against callers.
+KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x)
+{
+    return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x)))); // multiplies the duplicated real slot by cossin of the imaginary slot, i.e. r*(cos(t) + i*sin(t)) if cossin yields (cos, sin) lanes
+}
 
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x)
-    {
-        const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5);
-        return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t));
-    }
+template <typename T, size_t N> // Magnitude helper operating on an already-decomposed scalar vector; the result keeps the input width N.
+KFR_SINTRIN vec<T, N> cabsdup(vec<T, N> x)
+{
+    x = sqr(x); // square every lane in place (x is taken by value)
+    return sqrt(x + swap<2>(x)); // adds each lane to its swap<2> partner (presumably the adjacent lane of the same re/im pair - confirm), so both lanes of a pair hold the same magnitude
+}
 
-    KFR_HANDLE_SCALAR(csin)
-    KFR_HANDLE_SCALAR(csinh)
-    KFR_HANDLE_SCALAR(ccos)
-    KFR_HANDLE_SCALAR(ccosh)
-    KFR_HANDLE_SCALAR(cabs)
-    KFR_HANDLE_SCALAR(carg)
-    KFR_HANDLE_SCALAR(clog)
-    KFR_HANDLE_SCALAR(clog2)
-    KFR_HANDLE_SCALAR(clog10)
-    KFR_HANDLE_SCALAR(cexp)
-    KFR_HANDLE_SCALAR(cexp2)
-    KFR_HANDLE_SCALAR(cexp10)
-    KFR_HANDLE_SCALAR(polar)
-    KFR_HANDLE_SCALAR(cartesian)
-    KFR_HANDLE_SCALAR(csqrt)
-
-    KFR_SPEC_FN(in_complex, csin)
-    KFR_SPEC_FN(in_complex, csinh)
-    KFR_SPEC_FN(in_complex, ccos)
-    KFR_SPEC_FN(in_complex, ccosh)
-    KFR_SPEC_FN(in_complex, cabs)
-    KFR_SPEC_FN(in_complex, carg)
-    KFR_SPEC_FN(in_complex, clog)
-    KFR_SPEC_FN(in_complex, clog2)
-    KFR_SPEC_FN(in_complex, clog10)
-    KFR_SPEC_FN(in_complex, cexp)
-    KFR_SPEC_FN(in_complex, cexp2)
-    KFR_SPEC_FN(in_complex, cexp10)
-    KFR_SPEC_FN(in_complex, polar)
-    KFR_SPEC_FN(in_complex, cartesian)
-    KFR_SPEC_FN(in_complex, csqrt)
-};
+template <typename T, size_t N> // Complex square root over a vector of N complex<T> values.
+KFR_SINTRIN vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x)
+{
+    const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5); // NOTE(review): t is declared vec<T, N>, while cabs/carg in this file bind cdecom(x) to vec<T, N * 2> - confirm the width. Also no square root is applied to this half-sum here - verify against the intended formula
+    return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t)); // picks the imaginary-negated variant of t where dupodd(x) is negative (presumably the sign of the imaginary part - confirm)
+}
+
+KFR_HANDLE_SCALAR(csin)
+KFR_HANDLE_SCALAR(csinh)
+KFR_HANDLE_SCALAR(ccos)
+KFR_HANDLE_SCALAR(ccosh)
+KFR_HANDLE_SCALAR(cabs)
+KFR_HANDLE_SCALAR(carg)
+KFR_HANDLE_SCALAR(clog)
+KFR_HANDLE_SCALAR(clog2)
+KFR_HANDLE_SCALAR(clog10)
+KFR_HANDLE_SCALAR(cexp)
+KFR_HANDLE_SCALAR(cexp2)
+KFR_HANDLE_SCALAR(cexp10)
+KFR_HANDLE_SCALAR(polar)
+KFR_HANDLE_SCALAR(cartesian)
+KFR_HANDLE_SCALAR(csqrt)
+
+KFR_FN(csin)
+KFR_FN(csinh)
+KFR_FN(ccos)
+KFR_FN(ccosh)
+KFR_FN(cabs)
+KFR_FN(carg)
+KFR_FN(clog)
+KFR_FN(clog2)
+KFR_FN(clog10)
+KFR_FN(cexp)
+KFR_FN(cexp2)
+KFR_FN(cexp10)
+KFR_FN(polar)
+KFR_FN(cartesian)
+KFR_FN(csqrt)
 }
 
-namespace native
-{
-using fn_csin = internal::in_complex<>::fn_csin;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> csin(const T1& x)
+KFR_INTRIN T1 csin(const T1& x)
 {
-    return internal::in_complex<>::csin(x);
+    return internal::csin(x);
 }
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_csin, E1> csin(E1&& x)
+KFR_INTRIN expr_func<internal::fn_csin, E1> csin(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-
-using fn_csinh = internal::in_complex<>::fn_csinh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> csinh(const T1& x)
+KFR_INTRIN T1 csinh(const T1& x)
 {
-    return internal::in_complex<>::csinh(x);
+    return internal::csinh(x);
 }
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_csinh, E1> csinh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_csinh, E1> csinh(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-
-using fn_ccos = internal::in_complex<>::fn_ccos;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> ccos(const T1& x)
+KFR_INTRIN T1 ccos(const T1& x)
 {
-    return internal::in_complex<>::ccos(x);
+    return internal::ccos(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_ccos, E1> ccos(E1&& x)
+KFR_INTRIN expr_func<internal::fn_ccos, E1> ccos(E1&& x)
 {
-    return { fn_ccos(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_ccosh = internal::in_complex<>::fn_ccosh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> ccosh(const T1& x)
+KFR_INTRIN T1 ccosh(const T1& x)
 {
-    return internal::in_complex<>::ccosh(x);
+    return internal::ccosh(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_ccosh, E1> ccosh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_ccosh, E1> ccosh(E1&& x)
 {
-    return { fn_ccosh(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_cabs = internal::in_complex<>::fn_cabs;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN realftype<T1> cabs(const T1& x)
+KFR_INTRIN realtype<T1> cabs(const T1& x)
 {
-    return internal::in_complex<>::cabs(x);
+    return internal::cabs(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cabs, E1> cabs(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cabs, E1> cabs(E1&& x)
 {
-    return { fn_cabs(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_carg = internal::in_complex<>::fn_carg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN realftype<T1> carg(const T1& x)
+KFR_INTRIN realtype<T1> carg(const T1& x)
 {
-    return internal::in_complex<>::carg(x);
+    return internal::carg(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_carg, E1> carg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_carg, E1> carg(E1&& x)
 {
-    return { fn_carg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_clog = internal::in_complex<>::fn_clog;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> clog(const T1& x)
+KFR_INTRIN T1 clog(const T1& x)
 {
-    return internal::in_complex<>::clog(x);
+    return internal::clog(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_clog, E1> clog(E1&& x)
+KFR_INTRIN expr_func<internal::fn_clog, E1> clog(E1&& x)
 {
-    return { fn_clog(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_clog2 = internal::in_complex<>::fn_clog2;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> clog2(const T1& x)
+KFR_INTRIN T1 clog2(const T1& x)
 {
-    return internal::in_complex<>::clog2(x);
+    return internal::clog2(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_clog2, E1> clog2(E1&& x)
+KFR_INTRIN expr_func<internal::fn_clog2, E1> clog2(E1&& x)
 {
-    return { fn_clog2(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_clog10 = internal::in_complex<>::fn_clog10;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> clog10(const T1& x)
+KFR_INTRIN T1 clog10(const T1& x)
 {
-    return internal::in_complex<>::clog10(x);
+    return internal::clog10(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_clog10, E1> clog10(E1&& x)
+KFR_INTRIN expr_func<internal::fn_clog10, E1> clog10(E1&& x)
 {
-    return { fn_clog10(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_cexp = internal::in_complex<>::fn_cexp;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cexp(const T1& x)
+KFR_INTRIN T1 cexp(const T1& x)
 {
-    return internal::in_complex<>::cexp(x);
+    return internal::cexp(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cexp, E1> cexp(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cexp, E1> cexp(E1&& x)
 {
-    return { fn_cexp(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_cexp2 = internal::in_complex<>::fn_cexp2;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cexp2(const T1& x)
+KFR_INTRIN T1 cexp2(const T1& x)
 {
-    return internal::in_complex<>::cexp2(x);
+    return internal::cexp2(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cexp2, E1> cexp2(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cexp2, E1> cexp2(E1&& x)
 {
-    return { fn_cexp2(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_cexp10 = internal::in_complex<>::fn_cexp10;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cexp10(const T1& x)
+KFR_INTRIN T1 cexp10(const T1& x)
 {
-    return internal::in_complex<>::cexp10(x);
+    return internal::cexp10(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cexp10, E1> cexp10(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cexp10, E1> cexp10(E1&& x)
 {
-    return { fn_cexp10(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_polar = internal::in_complex<>::fn_polar;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> polar(const T1& x)
+KFR_INTRIN T1 polar(const T1& x)
 {
-    return internal::in_complex<>::polar(x);
+    return internal::polar(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_polar, E1> polar(E1&& x)
+KFR_INTRIN expr_func<internal::fn_polar, E1> polar(E1&& x)
 {
-    return { fn_polar(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_cartesian = internal::in_complex<>::fn_cartesian;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cartesian(const T1& x)
+KFR_INTRIN T1 cartesian(const T1& x)
 {
-    return internal::in_complex<>::cartesian(x);
+    return internal::cartesian(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cartesian, E1> cartesian(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cartesian, E1> cartesian(E1&& x)
 {
-    return { fn_cartesian(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
-
-using fn_csqrt = internal::in_complex<>::fn_csqrt;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> csqrt(const T1& x)
+KFR_INTRIN T1 csqrt(const T1& x)
 {
-    return internal::in_complex<>::csqrt(x);
+    return internal::csqrt(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_csqrt, E1> csqrt(E1&& x)
+KFR_INTRIN expr_func<internal::fn_csqrt, E1> csqrt(E1&& x)
 {
-    return { fn_csqrt(), std::forward<E1>(x) };
-}
-}
+    return { {}, std::forward<E1>(x) };
 }
-namespace cometa
-{
-template <typename T>
-struct compound_type_traits<kfr::complex<T>>
-{
-    constexpr static size_t width   = 2;
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static bool is_scalar = false;
-    template <typename U>
-    using rebind = kfr::complex<U>;
-    template <typename U>
-    using deep_rebind = kfr::complex<cometa::deep_rebind<subtype, U>>;
-
-    static constexpr subtype at(const kfr::complex<T>& value, size_t index)
-    {
-        return index == 0 ? value.real() : value.imag();
-    }
-};
 }
 
 #pragma clang diagnostic pop
diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp
@@ -41,63 +41,51 @@ constexpr T gamma_precalc[] = {
     -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0,  -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24,
 };
 
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_gamma : in_log_exp<cc>
+template <typename T, size_t N> // Gamma function approximation built from the gamma_precalc<T> coefficient table.
+KFR_SINTRIN vec<T, N> gamma(vec<T, N> z) // z: argument vector, taken by value
 {
-private:
-    using in_log_exp<cc>::exp;
-    using in_log_exp<cc>::pow;
-
-public:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> gamma(vec<T, N> z)
-    {
-        constexpr size_t Count = arraysize(internal::gamma_precalc<T>);
-        vec<T, N> accm = gamma_precalc<T>[0];
-        KFR_LOOP_UNROLL
-        for (size_t k = 1; k < Count; k++)
-            accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k));
-        accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5);
-        return accm / z;
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> factorial_approx(vec<T, N> x)
-    {
-        return gamma(x + T(1));
-    }
-    KFR_SPEC_FN(in_gamma, gamma)
-    KFR_SPEC_FN(in_gamma, factorial_approx)
-};
+    constexpr size_t Count = arraysize(gamma_precalc<T>); // number of precomputed coefficients
+    vec<T, N> accm = gamma_precalc<T>[0]; // the series starts from coefficient 0
+    KFR_LOOP_UNROLL
+    for (size_t k = 1; k < Count; k++) // remaining coefficients, k = 1 .. Count-1
+        accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k)); // adds coefficient[k] / (z + k)
+    accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); // scales the sum by exp(-(z + Count)) * (z + Count)^(z + 0.5)
+    return accm / z; // final division by z
 }
 
-namespace native
+template <typename T, size_t N> // Factorial approximation expressed through gamma.
+KFR_SINTRIN vec<T, N> factorial_approx(vec<T, N> x) // x: argument vector, taken by value
 {
-using fn_gamma = internal::in_gamma<>::fn_gamma;
+    return gamma(x + T(1)); // evaluates gamma at x + 1
+}
+KFR_HANDLE_SCALAR(gamma)
+KFR_HANDLE_SCALAR(factorial_approx)
+KFR_FN(gamma)
+KFR_FN(factorial_approx)
+}
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> gamma(const T1& x)
+KFR_INTRIN T1 gamma(const T1& x)
 {
-    return internal::in_gamma<>::gamma(x);
+    return internal::gamma(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_gamma, E1> gamma(E1&& x)
+KFR_INTRIN expr_func<internal::fn_gamma, E1> gamma(E1&& x)
 {
-    return { fn_gamma(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_factorial_approx = internal::in_gamma<>::fn_factorial_approx;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> factorial_approx(const T1& x)
+KFR_INTRIN T1 factorial_approx(const T1& x)
 {
-    return internal::in_gamma<>::factorial_approx(x);
+    return internal::factorial_approx(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_factorial_approx, E1> factorial_approx(E1&& x)
+KFR_INTRIN expr_func<internal::fn_factorial_approx, E1> factorial_approx(E1&& x)
 {
-    return { fn_factorial_approx(), std::forward<E1>(x) };
-}
+    return { {}, std::forward<E1>(x) };
 }
 }
 
diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp
@@ -35,149 +35,131 @@ namespace kfr
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native>
-struct in_hyperbolic : in_log_exp<c>
-{
-    constexpr static cpu_t cur = c;
-
-private:
-    using in_log_exp<c>::exp;
-
-public:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> sinh(vec<T, N> x)
-    {
-        return (exp(x) - exp(-x)) * T(0.5);
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> cosh(vec<T, N> x)
-    {
-        return (exp(x) + exp(-x)) * T(0.5);
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> tanh(vec<T, N> x)
-    {
-        x = -2 * x;
-        return (1 - exp(x)) / (1 + exp(x));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> coth(vec<T, N> x)
-    {
-        x = -2 * x;
-        return (1 + exp(x)) / (1 - exp(x));
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
-    KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x)
-    {
-        const vec<T, N> a = exp(x);
-        const vec<T, N> b = exp(-x);
-        return subadd(a, b) * T(0.5);
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
-    KFR_SINTRIN vec<T, N> coshsinh(vec<T, N> x)
-    {
-        const vec<T, N> a = exp(x);
-        const vec<T, N> b = exp(-x);
-        return addsub(a, b) * T(0.5);
-    }
-    KFR_HANDLE_SCALAR(sinh)
-    KFR_HANDLE_SCALAR(cosh)
-    KFR_HANDLE_SCALAR(tanh)
-    KFR_HANDLE_SCALAR(coth)
-    KFR_HANDLE_SCALAR(sinhcosh)
-    KFR_HANDLE_SCALAR(coshsinh)
-    KFR_SPEC_FN(in_hyperbolic, sinh)
-    KFR_SPEC_FN(in_hyperbolic, cosh)
-    KFR_SPEC_FN(in_hyperbolic, tanh)
-    KFR_SPEC_FN(in_hyperbolic, coth)
-    KFR_SPEC_FN(in_hyperbolic, sinhcosh)
-    KFR_SPEC_FN(in_hyperbolic, coshsinh)
-};
-}
-
-namespace native
-{
-using fn_sinh = internal::in_hyperbolic<>::fn_sinh;
+template <typename T, size_t N> // Hyperbolic sine of each element.
+KFR_SINTRIN vec<T, N> sinh(vec<T, N> x)
+{
+    return (exp(x) - exp(-x)) * T(0.5); // (e^x - e^-x) / 2, written directly with two exp calls
+}
+
+template <typename T, size_t N> // Hyperbolic cosine of each element.
+KFR_SINTRIN vec<T, N> cosh(vec<T, N> x)
+{
+    return (exp(x) + exp(-x)) * T(0.5); // (e^x + e^-x) / 2, written directly with two exp calls
+}
+
+template <typename T, size_t N> // Hyperbolic tangent of each element.
+KFR_SINTRIN vec<T, N> tanh(vec<T, N> x)
+{
+    x = -2 * x; // reuse x to hold -2x (x is taken by value)
+    return (1 - exp(x)) / (1 + exp(x)); // (1 - e^-2x) / (1 + e^-2x). NOTE(review): exp(x) is written twice; consider a local if the compiler does not merge the calls
+}
+
+template <typename T, size_t N> // Hyperbolic cotangent of each element.
+KFR_SINTRIN vec<T, N> coth(vec<T, N> x)
+{
+    x = -2 * x; // reuse x to hold -2x (x is taken by value)
+    return (1 + exp(x)) / (1 - exp(x)); // (1 + e^-2x) / (1 - e^-2x). NOTE(review): exp(x) is written twice; consider a local if the compiler does not merge the calls
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> // Combined sinh/cosh over alternating lanes; only enabled for vectors wider than one element.
+KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x)
+{
+    const vec<T, N> a = exp(x); // e^x
+    const vec<T, N> b = exp(-x); // e^-x
+    return subadd(a, b) * T(0.5); // presumably subadd yields a-b in even lanes and a+b in odd lanes, giving (sinh, cosh) pairs - confirm against subadd
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> // Combined cosh/sinh over alternating lanes; only enabled for vectors wider than one element.
+KFR_SINTRIN vec<T, N> coshsinh(vec<T, N> x)
+{
+    const vec<T, N> a = exp(x); // e^x
+    const vec<T, N> b = exp(-x); // e^-x
+    return addsub(a, b) * T(0.5); // presumably addsub yields a+b in even lanes and a-b in odd lanes, giving (cosh, sinh) pairs - confirm against addsub
+}
+
+KFR_HANDLE_SCALAR(sinh)
+KFR_HANDLE_SCALAR(cosh)
+KFR_HANDLE_SCALAR(tanh)
+KFR_HANDLE_SCALAR(coth)
+KFR_HANDLE_SCALAR(sinhcosh)
+KFR_HANDLE_SCALAR(coshsinh)
+KFR_FN(sinh)
+KFR_FN(cosh)
+KFR_FN(tanh)
+KFR_FN(coth)
+KFR_FN(sinhcosh)
+KFR_FN(coshsinh)
+}
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sinh(const T1& x)
+KFR_INTRIN T1 sinh(const T1& x)
 {
-    return internal::in_hyperbolic<>::sinh(x);
+    return internal::sinh(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sinh, E1> sinh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sinh, E1> sinh(E1&& x)
 {
-    return { fn_sinh(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_cosh = internal::in_hyperbolic<>::fn_cosh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cosh(const T1& x)
+KFR_INTRIN T1 cosh(const T1& x)
 {
-    return internal::in_hyperbolic<>::cosh(x);
+    return internal::cosh(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cosh, E1> cosh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cosh, E1> cosh(E1&& x)
 {
-    return { fn_cosh(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_tanh = internal::in_hyperbolic<>::fn_tanh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> tanh(const T1& x)
+KFR_INTRIN T1 tanh(const T1& x)
 {
-    return internal::in_hyperbolic<>::tanh(x);
+    return internal::tanh(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_tanh, E1> tanh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_tanh, E1> tanh(E1&& x)
 {
-    return { fn_tanh(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_coth = internal::in_hyperbolic<>::fn_coth;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> coth(const T1& x)
+KFR_INTRIN T1 coth(const T1& x)
 {
-    return internal::in_hyperbolic<>::coth(x);
+    return internal::coth(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_coth, E1> coth(E1&& x)
+KFR_INTRIN expr_func<internal::fn_coth, E1> coth(E1&& x)
 {
-    return { fn_coth(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_sinhcosh = internal::in_hyperbolic<>::fn_sinhcosh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sinhcosh(const T1& x)
+KFR_INTRIN T1 sinhcosh(const T1& x)
 {
-    return internal::in_hyperbolic<>::sinhcosh(x);
+    return internal::sinhcosh(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sinhcosh, E1> sinhcosh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sinhcosh, E1> sinhcosh(E1&& x)
 {
-    return { fn_sinhcosh(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_coshsinh = internal::in_hyperbolic<>::fn_coshsinh;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> coshsinh(const T1& x)
+KFR_INTRIN T1 coshsinh(const T1& x)
 {
-    return internal::in_hyperbolic<>::coshsinh(x);
+    return internal::coshsinh(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_coshsinh, E1> coshsinh(E1&& x)
+KFR_INTRIN expr_func<internal::fn_coshsinh, E1> coshsinh(E1&& x)
 {
-    return { fn_coshsinh(), std::forward<E1>(x) };
-}
+    return { {}, std::forward<E1>(x) };
 }
 }
diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp
@@ -26,518 +26,490 @@
 #include "constants.hpp"
 #include "function.hpp"
 #include "min_max.hpp"
+#include "clamp.hpp"
 #include "operators.hpp"
 #include "round.hpp"
 #include "select.hpp"
 #include "shuffle.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native>
-struct in_log_exp : in_select<c>, in_min_max<c>, in_clamp<c>, in_round<c>, in_abs<c>
-{
-private:
-    constexpr static cpu_t cur = c;
-    using in_select<c>::select;
-    using in_round<c>::floor;
-    using in_clamp<c>::clamp;
-    using in_abs<c>::abs;
-
-public:
-    template <size_t N>
-    KFR_SINTRIN vec<i32, N> vilogbp1(vec<f32, N> d)
-    {
-        mask<i32, N> m = d < 5.421010862427522E-20f;
-        d = select(m, 1.8446744073709552E19f * d, d);
-        vec<i32, N> q = (ibitcast(d) >> 23) & 0xff;
-        q = select(m, q - (64 + 0x7e), q - 0x7e);
-        return q;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<i64, N> vilogbp1(vec<f64, N> d)
-    {
-        mask<i64, N> m = d < 4.9090934652977266E-91;
-        d = select(m, 2.037035976334486E90 * d, d);
-        vec<i64, N> q = (ibitcast(d) >> 52) & 0x7ff;
-        q = select(m, q - (300 + 0x03fe), q - 0x03fe);
-        return q;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> vldexpk(vec<f32, N> x, vec<i32, N> q)
-    {
-        vec<i32, N> m = q >> 31;
-        m = (((m + q) >> 6) - m) << 4;
-        q = q - (m << 2);
-        m = clamp(m + 0x7f, vec<i32, N>(0xff));
-        vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23));
-        return x * u * bitcast<f32>((cast<i32>(q + 0x7f)) << 23);
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> vldexpk(vec<f64, N> x, vec<i64, N> q)
-    {
-        vec<i64, N> m = q >> 31;
-        m = (((m + q) >> 9) - m) << 7;
-        q = q - (m << 2);
-        m = clamp(m + 0x3ff, i64(0x7ff));
-        vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52));
-        return x * u * bitcast<f64>((cast<i64>(q + 0x3ff)) << 52);
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> logb(vec<T, N> x)
-    {
-        return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1));
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> log(vec<f32, N> d)
-    {
-        vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f );
-        vec<f32, N> m = vldexpk(d, -e);
-
-        vec<f32, N> x  = (m - 1.0f) / (m + 1.0f);
-        vec<f32, N> x2 = x * x;
-
-        vec<f32, N> sp = select(d < 0, c_qnan<f32>, c_neginfinity<f32>);
-
-        vec<f32, N> t = 0.2371599674224853515625f;
-        t = fmadd(t, x2, 0.285279005765914916992188f);
-        t = fmadd(t, x2, 0.400005519390106201171875f);
-        t = fmadd(t, x2, 0.666666567325592041015625f);
-        t = fmadd(t, x2, 2.0f);
-
-        x = x * t + c_log_2<f32> * cast<f32>(e);
-        x = select(d > 0, x, sp);
-
-        return x;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> log(vec<f64, N> d)
-    {
-        vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 );
-        vec<f64, N> m = vldexpk(d, -e);
-
-        vec<f64, N> x  = (m - 1.0) / (m + 1.0);
-        vec<f64, N> x2 = x * x;
-
-        vec<f64, N> sp = select(d < 0, c_qnan<f64>, c_neginfinity<f64>);
-
-        vec<f64, N> t = 0.148197055177935105296783;
-        t = fmadd(t, x2, 0.153108178020442575739679);
-        t = fmadd(t, x2, 0.181837339521549679055568);
-        t = fmadd(t, x2, 0.22222194152736701733275);
-        t = fmadd(t, x2, 0.285714288030134544449368);
-        t = fmadd(t, x2, 0.399999999989941956712869);
-        t = fmadd(t, x2, 0.666666666666685503450651);
-        t = fmadd(t, x2, 2);
-
-        x = x * t + c_log_2<f64> * cast<f64>(e);
-        x = select(d > 0, x, sp);
-
-        return x;
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> log2(vec<T, N> x)
-    {
-        return log(x) * c_recip_log_2<T>;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> log10(vec<T, N> x)
-    {
-        return log(x) * c_recip_log_10<T>;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d)
-    {
-        const f32 ln2_part1 = 0.6931457519f;
-        const f32 ln2_part2 = 1.4286067653e-6f;
-
-        vec<i32, N> q = cast<i32>(floor(d * c_recip_log_2<f32>));
-        vec<f32, N> s, u;
-
-        s = fmadd(cast<f32>(q), -ln2_part1, d);
-        s = fmadd(cast<f32>(q), -ln2_part2, s);
-
-        const f32 c2 = 0.4999999105930328369140625f;
-        const f32 c3 = 0.166668415069580078125f;
-        const f32 c4 = 4.16539050638675689697265625e-2f;
-        const f32 c5 = 8.378830738365650177001953125e-3f;
-        const f32 c6 = 1.304379315115511417388916015625e-3f;
-        const f32 c7 = 2.7555381529964506626129150390625e-4f;
-
-        u = c7;
-        u = fmadd(u, s, c6);
-        u = fmadd(u, s, c5);
-        u = fmadd(u, s, c4);
-        u = fmadd(u, s, c3);
-        u = fmadd(u, s, c2);
-
-        u = s * s * u + s + 1.0f;
-        u = vldexpk(u, q);
-
-        u = select(d == c_neginfinity<f32>, 0.f, u);
-
-        return u;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d)
-    {
-        const f64 ln2_part1 = 0.69314717501401901245;
-        const f64 ln2_part2 = 5.545926273775592108e-009;
-
-        vec<i64, N> q = cast<i64>(floor(d * c_recip_log_2<f64>));
-        vec<f64, N> s, u;
-
-        s = fmadd(cast<f64>(q), -ln2_part1, d);
-        s = fmadd(cast<f64>(q), -ln2_part2, s);
-
-        const f64 c2  = 0.499999999999994948485237955537741072475910186767578;
-        const f64 c3  = 0.166666666667024204739888659787538927048444747924805;
-        const f64 c4  = 4.16666666578945840693215529881854308769106864929199e-2;
-        const f64 c5  = 8.3333334397461874404333670440792047884315252304077e-3;
-        const f64 c6  = 1.3888881489747750223179290074426717183087021112442e-3;
-        const f64 c7  = 1.9841587032493949419205414574918222569976933300495e-4;
-        const f64 c8  = 2.47929324077393282239802768662784160369483288377523e-5;
-        const f64 c9  = 2.77076037925831049422552981864598109496000688523054e-6;
-        const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
-        const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;
-
-        u = c11;
-        u = fmadd(u, s, c10);
-        u = fmadd(u, s, c9);
-        u = fmadd(u, s, c8);
-        u = fmadd(u, s, c7);
-        u = fmadd(u, s, c6);
-        u = fmadd(u, s, c5);
-        u = fmadd(u, s, c4);
-        u = fmadd(u, s, c3);
-        u = fmadd(u, s, c2);
-
-        u = s * s * u + s + 1.0;
-        u = vldexpk(u, q);
-
-        u = select(d == c_neginfinity<f64>, 0.0, u);
-
-        return u;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> exp2(vec<T, N> x)
-    {
-        return exp(x * c_log_2<T>);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> exp10(vec<T, N> x)
-    {
-        return exp(x * c_log_10<T>);
-    }
-
-    template <typename T1, typename T2>
-    KFR_SINTRIN common_type<T1, T2> logn(const T1& a, const T2& b)
-    {
-        return log(a) / log(b);
-    }
-
-    template <typename T1, typename T2>
-    KFR_SINTRIN common_type<T1, T2> logm(const T1& a, const T2& b)
-    {
-        return log(a) * b;
-    }
-
-    template <typename T1, typename T2, typename T3>
-    KFR_SINTRIN common_type<T1, T2, T3> exp_fmadd(const T1& x, const T2& m, const T3& a)
-    {
-        return exp(fmadd(x, m, a));
-    }
-
-    template <typename T1, typename T2, typename T3>
-    KFR_SINTRIN common_type<T1, T2, T3> log_fmadd(const T1& x, const T2& m, const T3& a)
-    {
-        return fmadd(log(x), m, a);
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b)
-    {
-        const vec<T, N> t       = exp(b * log(abs(a)));
-        const mask<T, N> isint  = floor(b) == b;
-        const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0;
-        return select(a > T(), t,
-                      select(a == T(), T(1), select(isint, select(iseven, t, -t), broadcast<N>(c_qnan<T>))));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> root(vec<T, N> x, vec<T, N> b)
-    {
-        return exp(reciprocal(b) * log(x));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> cbrt(vec<T, N> x)
-    {
-        return pow<T, N>(x, T(0.333333333333333333333333333333333));
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> exp(vec<T, N> x)
-    {
-        return exp(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> exp2(vec<T, N> x)
-    {
-        return exp2(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> exp10(vec<T, N> x)
-    {
-        return exp10(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> log(vec<T, N> x)
-    {
-        return log(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> log2(vec<T, N> x)
-    {
-        return log2(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> log10(vec<T, N> x)
-    {
-        return log10(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> cbrt(vec<T, N> x)
-    {
-        return cbrt(cast<Tout>(x));
-    }
-
-    KFR_HANDLE_SCALAR(exp)
-    KFR_HANDLE_SCALAR(exp2)
-    KFR_HANDLE_SCALAR(exp10)
-    KFR_HANDLE_SCALAR(log)
-    KFR_HANDLE_SCALAR(log2)
-    KFR_HANDLE_SCALAR(log10)
-    KFR_HANDLE_SCALAR(logb)
-    KFR_HANDLE_SCALAR(pow)
-    KFR_HANDLE_SCALAR(root)
-    KFR_HANDLE_SCALAR(cbrt)
-
-    KFR_SPEC_FN(in_log_exp, exp)
-    KFR_SPEC_FN(in_log_exp, exp2)
-    KFR_SPEC_FN(in_log_exp, exp10)
-    KFR_SPEC_FN(in_log_exp, log)
-    KFR_SPEC_FN(in_log_exp, log2)
-    KFR_SPEC_FN(in_log_exp, log10)
-    KFR_SPEC_FN(in_log_exp, logb)
-    KFR_SPEC_FN(in_log_exp, logn)
-    KFR_SPEC_FN(in_log_exp, logm)
-    KFR_SPEC_FN(in_log_exp, exp_fmadd)
-    KFR_SPEC_FN(in_log_exp, log_fmadd)
-    KFR_SPEC_FN(in_log_exp, pow)
-    KFR_SPEC_FN(in_log_exp, root)
-    KFR_SPEC_FN(in_log_exp, cbrt)
-};
-}
-namespace native
-{
-using fn_exp = internal::in_log_exp<>::fn_exp;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> exp(const T1& x)
+// Per-lane integer exponent helper for f32 vectors: returns the 8-bit field taken from
+// bits 23..30 of ibitcast(d), minus 0x7e (assuming ibitcast reinterprets the IEEE-754
+// binary32 bits, this is the biased exponent). Lanes below 5.421010862427522E-20f (2^-64)
+// are first multiplied by 1.8446744073709552E19f (2^64) and the extra 64 is subtracted
+// from the result again.
+template <size_t N>
+KFR_SINTRIN vec<i32, N> vilogbp1(vec<f32, N> d)
 {
-    return internal::in_log_exp<>::exp(x);
+    mask<i32, N> m = d < 5.421010862427522E-20f;
+    d = select(m, 1.8446744073709552E19f * d, d);
+    vec<i32, N> q = (ibitcast(d) >> 23) & 0xff;
+    q = select(m, q - (64 + 0x7e), q - 0x7e);
+    return q;
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_exp, E1> exp(E1&& x)
+// Per-lane integer exponent helper for f64 vectors: returns the 11-bit field taken from
+// bits 52..62 of ibitcast(d), minus 0x03fe (assuming ibitcast reinterprets the IEEE-754
+// binary64 bits, this is the biased exponent). Lanes below 4.9090934652977266E-91 (2^-300)
+// are first multiplied by 2.037035976334486E90 (2^300) and the extra 300 is subtracted
+// from the result again.
+template <size_t N>
+KFR_SINTRIN vec<i64, N> vilogbp1(vec<f64, N> d)
 {
-    return { fn_exp(), std::forward<E1>(x) };
+    mask<i64, N> m = d < 4.9090934652977266E-91;
+    d = select(m, 2.037035976334486E90 * d, d);
+    vec<i64, N> q = (ibitcast(d) >> 52) & 0x7ff;
+    q = select(m, q - (300 + 0x03fe), q - 0x03fe);
+    return q;
 }
 
-using fn_exp2 = internal::in_log_exp<>::fn_exp2;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> exp2(const T1& x)
+// Multiplies each lane of x by a power of two selected by q, building the factors directly
+// in the f32 exponent field (bit 23 upward). q is split into a coarse part m (a multiple of
+// 16, roughly q / 4) and the remainder q - 4 * m. The coarse factor goes through pow4
+// (defined elsewhere; presumably the fourth power - confirm) and the remainder is applied
+// as one more power of two.
+template <size_t N>
+KFR_SINTRIN vec<f32, N> vldexpk(vec<f32, N> x, vec<i32, N> q)
 {
-    return internal::in_log_exp<>::exp2(x);
+    vec<i32, N> m = q >> 31; // 0 or -1 per lane, assuming an arithmetic shift
+    m = (((m + q) >> 6) - m) << 4;
+    q = q - (m << 2);
+    m = clamp(m + 0x7f, vec<i32, N>(0xff)); // two-argument clamp is defined elsewhere
+    vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23));
+    return x * u * bitcast<f32>((cast<i32>(q + 0x7f)) << 23);
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_exp2, E1> exp2(E1&& x)
+// Multiplies each lane of x by a power of two selected by q, building the factors directly
+// in the f64 exponent field (bit 52 upward). q is split into a coarse part m (a multiple of
+// 128, roughly q / 4) and the remainder q - 4 * m. The coarse factor goes through pow4
+// (defined elsewhere; presumably the fourth power - confirm) and the remainder is applied
+// as one more power of two.
+template <size_t N>
+KFR_SINTRIN vec<f64, N> vldexpk(vec<f64, N> x, vec<i64, N> q)
+{
+    // NOTE(review): the shift count is 31 although the lanes are 64-bit; this only yields
+    // 0 / -1 when q fits in the signed 32-bit range - confirm callers guarantee that.
+    vec<i64, N> m = q >> 31;
+    m = (((m + q) >> 9) - m) << 7;
+    q = q - (m << 2);
+    m = clamp(m + 0x3ff, i64(0x7ff)); // two-argument clamp is defined elsewhere
+    vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52));
+    return x * u * bitcast<f64>((cast<i64>(q + 0x3ff)) << 52);
+}
+
+// Per-lane exponent as a floating-point value: vilogbp1(x) - 1 converted to T.
+// Lanes equal to T() (zero) yield -c_infinity<T> instead.
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> logb(vec<T, N> x)
+{
+    return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1));
+}
+
+// Logarithm kernel for f32 vectors.
+// d is rescaled by a power of two to m = vldexpk(d, -e), a polynomial in
+// x = (m - 1) / (m + 1) is evaluated, and c_log_2<f32> * e is added back.
+// Lanes for which d > 0 is false receive sp instead: c_qnan for d < 0,
+// c_neginfinity otherwise.
+template <size_t N>
+KFR_SINTRIN vec<f32, N> log(vec<f32, N> d)
+{
+    // 0.7071 is the leading part of 0.70710678...; the trailing comment keeps the dropped digits.
+    vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f );
+    vec<f32, N> m = vldexpk(d, -e);
+
+    vec<f32, N> x  = (m - 1.0f) / (m + 1.0f);
+    vec<f32, N> x2 = x * x;
+
+    // Replacement value for lanes that fail the d > 0 test below.
+    // NOTE(review): a NaN lane fails both d < 0 and d > 0 under IEEE comparison rules and
+    // would therefore end up as c_neginfinity - confirm this is intended.
+    vec<f32, N> sp = select(d < 0, c_qnan<f32>, c_neginfinity<f32>);
+
+    // Horner evaluation in x2 using fmadd.
+    vec<f32, N> t = 0.2371599674224853515625f;
+    t = fmadd(t, x2, 0.285279005765914916992188f);
+    t = fmadd(t, x2, 0.400005519390106201171875f);
+    t = fmadd(t, x2, 0.666666567325592041015625f);
+    t = fmadd(t, x2, 2.0f);
+
+    x = x * t + c_log_2<f32> * cast<f32>(e);
+    x = select(d > 0, x, sp);
+
+    return x;
+}
+
+// Logarithm kernel for f64 vectors.
+// d is rescaled by a power of two to m = vldexpk(d, -e), a polynomial in
+// x = (m - 1) / (m + 1) is evaluated, and c_log_2<f64> * e is added back.
+// Lanes for which d > 0 is false receive sp instead: c_qnan for d < 0,
+// c_neginfinity otherwise.
+template <size_t N>
+KFR_SINTRIN vec<f64, N> log(vec<f64, N> d)
+{
+    // 0.7071 is the leading part of 0.70710678...; the trailing comment keeps the dropped digits.
+    vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 );
+    vec<f64, N> m = vldexpk(d, -e);
+
+    vec<f64, N> x  = (m - 1.0) / (m + 1.0);
+    vec<f64, N> x2 = x * x;
+
+    // Replacement value for lanes that fail the d > 0 test below.
+    // NOTE(review): a NaN lane fails both d < 0 and d > 0 under IEEE comparison rules and
+    // would therefore end up as c_neginfinity - confirm this is intended.
+    vec<f64, N> sp = select(d < 0, c_qnan<f64>, c_neginfinity<f64>);
+
+    // Horner evaluation in x2 using fmadd.
+    vec<f64, N> t = 0.148197055177935105296783;
+    t = fmadd(t, x2, 0.153108178020442575739679);
+    t = fmadd(t, x2, 0.181837339521549679055568);
+    t = fmadd(t, x2, 0.22222194152736701733275);
+    t = fmadd(t, x2, 0.285714288030134544449368);
+    t = fmadd(t, x2, 0.399999999989941956712869);
+    t = fmadd(t, x2, 0.666666666666685503450651);
+    t = fmadd(t, x2, 2);
+
+    x = x * t + c_log_2<f64> * cast<f64>(e);
+    x = select(d > 0, x, sp);
+
+    return x;
+}
+
+// log2 for floating-point element types (is_f_class<T>): log(x) multiplied by c_recip_log_2<T>.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> log2(vec<T, N> x)
+{
+    return log(x) * c_recip_log_2<T>;
+}
+// log10 for floating-point element types (is_f_class<T>): log(x) multiplied by c_recip_log_10<T>.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> log10(vec<T, N> x)
+{
+    return log(x) * c_recip_log_10<T>;
+}
+
+// Exponential kernel for f32 vectors.
+// q = floor(d * c_recip_log_2) is removed from d in two fmadd steps (ln2_part1, then the
+// much smaller ln2_part2), a polynomial in the remainder s is evaluated, and the result is
+// rescaled with vldexpk(u, q). Lanes equal to c_neginfinity are forced to 0.
+template <size_t N>
+KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d)
+{
+    const f32 ln2_part1 = 0.6931457519f;
+    const f32 ln2_part2 = 1.4286067653e-6f;
+
+    vec<i32, N> q = cast<i32>(floor(d * c_recip_log_2<f32>));
+    vec<f32, N> s, u;
+
+    // s = d - q * (ln2_part1 + ln2_part2), subtracted one part at a time.
+    s = fmadd(cast<f32>(q), -ln2_part1, d);
+    s = fmadd(cast<f32>(q), -ln2_part2, s);
+
+    const f32 c2 = 0.4999999105930328369140625f;
+    const f32 c3 = 0.166668415069580078125f;
+    const f32 c4 = 4.16539050638675689697265625e-2f;
+    const f32 c5 = 8.378830738365650177001953125e-3f;
+    const f32 c6 = 1.304379315115511417388916015625e-3f;
+    const f32 c7 = 2.7555381529964506626129150390625e-4f;
+
+    // Horner evaluation of c7..c2 in s.
+    u = c7;
+    u = fmadd(u, s, c6);
+    u = fmadd(u, s, c5);
+    u = fmadd(u, s, c4);
+    u = fmadd(u, s, c3);
+    u = fmadd(u, s, c2);
+
+    u = s * s * u + s + 1.0f;
+    u = vldexpk(u, q);
+
+    u = select(d == c_neginfinity<f32>, 0.f, u);
+
+    return u;
+}
+
+// Exponential kernel for f64 vectors.
+// q = floor(d * c_recip_log_2) is removed from d in two fmadd steps (ln2_part1, then the
+// much smaller ln2_part2), a polynomial in the remainder s is evaluated, and the result is
+// rescaled with vldexpk(u, q). Lanes equal to c_neginfinity are forced to 0.
+template <size_t N>
+KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d)
+{
+    const f64 ln2_part1 = 0.69314717501401901245;
+    const f64 ln2_part2 = 5.545926273775592108e-009;
+
+    vec<i64, N> q = cast<i64>(floor(d * c_recip_log_2<f64>));
+    vec<f64, N> s, u;
+
+    // s = d - q * (ln2_part1 + ln2_part2), subtracted one part at a time.
+    s = fmadd(cast<f64>(q), -ln2_part1, d);
+    s = fmadd(cast<f64>(q), -ln2_part2, s);
+
+    const f64 c2  = 0.499999999999994948485237955537741072475910186767578;
+    const f64 c3  = 0.166666666667024204739888659787538927048444747924805;
+    const f64 c4  = 4.16666666578945840693215529881854308769106864929199e-2;
+    const f64 c5  = 8.3333334397461874404333670440792047884315252304077e-3;
+    const f64 c6  = 1.3888881489747750223179290074426717183087021112442e-3;
+    const f64 c7  = 1.9841587032493949419205414574918222569976933300495e-4;
+    const f64 c8  = 2.47929324077393282239802768662784160369483288377523e-5;
+    const f64 c9  = 2.77076037925831049422552981864598109496000688523054e-6;
+    const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
+    const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;
+
+    // Horner evaluation of c11..c2 in s.
+    u = c11;
+    u = fmadd(u, s, c10);
+    u = fmadd(u, s, c9);
+    u = fmadd(u, s, c8);
+    u = fmadd(u, s, c7);
+    u = fmadd(u, s, c6);
+    u = fmadd(u, s, c5);
+    u = fmadd(u, s, c4);
+    u = fmadd(u, s, c3);
+    u = fmadd(u, s, c2);
+
+    u = s * s * u + s + 1.0;
+    u = vldexpk(u, q);
+
+    u = select(d == c_neginfinity<f64>, 0.0, u);
+
+    return u;
+}
+// exp2 for floating-point element types (is_f_class<T>): exp of x scaled by c_log_2<T>.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> exp2(vec<T, N> x)
+{
+    return exp(x * c_log_2<T>);
+}
+// exp10 for floating-point element types (is_f_class<T>): exp of x scaled by c_log_10<T>.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> exp10(vec<T, N> x)
+{
+    return exp(x * c_log_10<T>);
+}
+
+// Returns log(a) / log(b), converted to common_type<T1, T2>.
+template <typename T1, typename T2>
+KFR_SINTRIN common_type<T1, T2> logn(const T1& a, const T2& b)
+{
+    return log(a) / log(b);
+}
+
+// Returns log(a) * b, converted to common_type<T1, T2>.
+template <typename T1, typename T2>
+KFR_SINTRIN common_type<T1, T2> logm(const T1& a, const T2& b)
+{
+    return log(a) * b;
+}
+
+// Returns exp(fmadd(x, m, a)), converted to common_type<T1, T2, T3>.
+template <typename T1, typename T2, typename T3>
+KFR_SINTRIN common_type<T1, T2, T3> exp_fmadd(const T1& x, const T2& m, const T3& a)
 {
-    return { fn_exp2(), std::forward<E1>(x) };
+    return exp(fmadd(x, m, a));
+}
+
+// Returns fmadd(log(x), m, a), converted to common_type<T1, T2, T3>.
+template <typename T1, typename T2, typename T3>
+KFR_SINTRIN common_type<T1, T2, T3> log_fmadd(const T1& x, const T2& m, const T3& a)
+{
+    return fmadd(log(x), m, a);
+}
+
+// Element-wise power a^b for vectors.
+//   a > 0  : exp(b * log(a))
+//   a == 0 : 0 for b > 0, +infinity for b < 0, 1 for b == 0; any other b (NaN) is passed through
+//   a < 0  : for integral b, exp(b * log(|a|)) with the sign chosen by the parity of b;
+//            quiet NaN for non-integral b
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b)
+{
+    const vec<T, N> t       = exp(b * log(abs(a)));
+    const mask<T, N> isint  = floor(b) == b;
+    const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0;
+    // A zero base must not be answered with a constant 1: the result depends on the sign of
+    // the exponent, and t cannot be used because log(0) is not finite.
+    const vec<T, N> zerobase =
+        select(b > T(), T(), select(b < T(), broadcast<N>(c_infinity<T>), select(b == T(), T(1), b)));
+    return select(a > T(), t,
+                  select(a == T(), zerobase, select(isint, select(iseven, t, -t), broadcast<N>(c_qnan<T>))));
+}
+
+// Returns exp(reciprocal(b) * log(x)) per lane.
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> root(vec<T, N> x, vec<T, N> b)
+{
+    return exp(reciprocal(b) * log(x));
+}
+
+// Cube root implemented as pow(x, 1/3) with the exponent given as a T constant.
+// NOTE(review): because this forwards to pow(), a negative x takes pow's non-integral
+// exponent branch and x == 0 takes pow's zero-base branch - confirm both results are the
+// ones wanted for a cube root.
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> cbrt(vec<T, N> x)
+{
+    return pow<T, N>(x, T(0.333333333333333333333333333333333));
+}
+
+// exp for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls exp again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> exp(vec<T, N> x)
+{
+    return exp(cast<Tout>(x));
+}
+// exp2 for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls exp2 again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> exp2(vec<T, N> x)
+{
+    return exp2(cast<Tout>(x));
+}
+// exp10 for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls exp10 again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> exp10(vec<T, N> x)
+{
+    return exp10(cast<Tout>(x));
+}
+// log for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls log again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> log(vec<T, N> x)
+{
+    return log(cast<Tout>(x));
+}
+// log2 for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls log2 again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> log2(vec<T, N> x)
+{
+    return log2(cast<Tout>(x));
+}
+// log10 for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls log10 again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> log10(vec<T, N> x)
+{
+    return log10(cast<Tout>(x));
+}
+// cbrt for element types where is_f_class<T> is false: converts the vector to Tout
+// (ftype<T> by default) and calls cbrt again on the converted vector.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> cbrt(vec<T, N> x)
+{
+    return cbrt(cast<Tout>(x));
+}
+
+// KFR_HANDLE_SCALAR is defined elsewhere; presumably it adds overloads that accept scalar
+// arguments for each vector function named here - confirm against the macro definition.
+KFR_HANDLE_SCALAR(exp)
+KFR_HANDLE_SCALAR(exp2)
+KFR_HANDLE_SCALAR(exp10)
+KFR_HANDLE_SCALAR(log)
+KFR_HANDLE_SCALAR(log2)
+KFR_HANDLE_SCALAR(log10)
+KFR_HANDLE_SCALAR(logb)
+KFR_HANDLE_SCALAR(pow)
+KFR_HANDLE_SCALAR(root)
+KFR_HANDLE_SCALAR(cbrt)
+
+// KFR_FN is defined elsewhere; presumably each line declares the fn_<name> function object
+// (internal::fn_exp, internal::fn_log, ...) that the expression overloads after this
+// namespace refer to - confirm against the macro definition.
+KFR_FN(exp)
+KFR_FN(exp2)
+KFR_FN(exp10)
+KFR_FN(log)
+KFR_FN(log2)
+KFR_FN(log10)
+KFR_FN(logb)
+KFR_FN(logn)
+KFR_FN(logm)
+KFR_FN(exp_fmadd)
+KFR_FN(log_fmadd)
+KFR_FN(pow)
+KFR_FN(root)
+KFR_FN(cbrt)
 }
 
-using fn_exp10 = internal::in_log_exp<>::fn_exp10;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> exp10(const T1& x)
+KFR_INTRIN T1 exp(const T1& x)
 {
-    return internal::in_log_exp<>::exp10(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::exp and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::exp(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_exp10, E1> exp10(E1&& x)
+KFR_INTRIN expr_func<internal::fn_exp, E1> exp(E1&& x)
 {
-    return { fn_exp10(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_exp and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_log = internal::in_log_exp<>::fn_log;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> log(const T1& x)
+KFR_INTRIN T1 exp2(const T1& x)
 {
-    return internal::in_log_exp<>::log(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::exp2 and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::exp2(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_log, E1> log(E1&& x)
+KFR_INTRIN expr_func<internal::fn_exp2, E1> exp2(E1&& x)
 {
-    return { fn_log(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_exp2 and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_log2 = internal::in_log_exp<>::fn_log2;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> log2(const T1& x)
+KFR_INTRIN T1 exp10(const T1& x)
 {
-    return internal::in_log_exp<>::log2(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::exp10 and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::exp10(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_log2, E1> log2(E1&& x)
+KFR_INTRIN expr_func<internal::fn_exp10, E1> exp10(E1&& x)
 {
-    return { fn_log2(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_exp10 and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_log10 = internal::in_log_exp<>::fn_log10;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> log10(const T1& x)
+KFR_INTRIN T1 log(const T1& x)
 {
-    return internal::in_log_exp<>::log10(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::log and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::log(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_log10, E1> log10(E1&& x)
+KFR_INTRIN expr_func<internal::fn_log, E1> log(E1&& x)
 {
-    return { fn_log10(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_log and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_logb = internal::in_log_exp<>::fn_logb;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> logb(const T1& x)
+KFR_INTRIN T1 log2(const T1& x)
 {
-    return internal::in_log_exp<>::logb(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::log2 and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::log2(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_logb, E1> logb(E1&& x)
+KFR_INTRIN expr_func<internal::fn_log2, E1> log2(E1&& x)
 {
-    return { fn_logb(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_log2 and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_logn = internal::in_log_exp<>::fn_logn;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> logn(const T1& x)
+KFR_INTRIN T1 log10(const T1& x)
 {
-    return internal::in_log_exp<>::logn(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::log10 and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::log10(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_logn, E1> logn(E1&& x)
+KFR_INTRIN expr_func<internal::fn_log10, E1> log10(E1&& x)
 {
-    return { fn_logn(), std::forward<E1>(x) };
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_log10 and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_logm = internal::in_log_exp<>::fn_logm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> logm(const T1& x)
+KFR_INTRIN T1 logb(const T1& x)
 {
-    return internal::in_log_exp<>::logm(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::logb and returns the result as T1.
+    // NOTE(review): the removed wrappers returned ftype<T1>; confirm that returning plain
+    // T1 is intended for non-floating T1.
+    return internal::logb(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_logm, E1> logm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_logb, E1> logb(E1&& x)
+{
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_logb and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
+}
+
+// Numeric overload of logn (constrained on is_numeric_args<T1, T2>): forwards both
+// arguments to internal::logn, which computes log(x) / log(y).
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_INTRIN common_type<T1, T2> logn(const T1& x, const T2& y)
+{
+    return internal::logn(x, y);
+}
+
+// Expression overload of logn (constrained on is_input_expressions<E1, E2>): builds the
+// expr_func from an empty-brace-initialized internal::fn_logn and both forwarded arguments.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRIN expr_func<internal::fn_logn, E1, E2> logn(E1&& x, E2&& y)
+{
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+// Numeric overload of logm (constrained on is_numeric_args<T1, T2>): forwards both
+// arguments to internal::logm, which computes log(x) * y.
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_INTRIN common_type<T1, T2> logm(const T1& x, const T2& y)
 {
-    return { fn_logm(), std::forward<E1>(x) };
+    return internal::logm(x, y);
+}
+
+// Expression overload of logm (constrained on is_input_expressions<E1, E2>): builds the
+// expr_func from an empty-brace-initialized internal::fn_logm and both forwarded arguments.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRIN expr_func<internal::fn_logm, E1, E2> logm(E1&& x, E2&& y)
+{
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
 
-using fn_exp_fmadd = internal::in_log_exp<>::fn_exp_fmadd;
 template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INLINE ftype<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a)
+KFR_INTRIN common_type<T1, T2, T3> exp_fmadd(const T1& x, const T2& y, const T3& z)
 {
-    return internal::in_log_exp<>::exp_fmadd(x, m, a);
+    // Numeric overload (constrained on is_numeric_args<T1, T2, T3>): forwards to
+    // internal::exp_fmadd, which computes exp(fmadd(x, y, z)).
+    // NOTE(review): the removed declaration wrapped the return type in ftype<>; confirm
+    // that dropping it is intended for non-floating argument types.
+    return internal::exp_fmadd(x, y, z);
 }
 
 template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INLINE expr_func<fn_exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& m, E3&& a)
+KFR_INTRIN expr_func<internal::fn_exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z)
 {
-    return { fn_exp_fmadd(), std::forward<E1>(x), std::forward<E2>(m), std::forward<E3>(a) };
+    // Expression overload (constrained on is_input_expressions<E1, E2, E3>): builds the
+    // expr_func from an empty-brace-initialized internal::fn_exp_fmadd and the three
+    // forwarded arguments.
+    return { {}, std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
 }
-using fn_log_fmadd = internal::in_log_exp<>::fn_log_fmadd;
+
 template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INLINE ftype<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a)
+KFR_INTRIN common_type<T1, T2, T3> log_fmadd(const T1& x, const T2& y, const T3& z)
 {
-    return internal::in_log_exp<>::log_fmadd(x, m, a);
+    // Numeric overload (constrained on is_numeric_args<T1, T2, T3>): forwards to
+    // internal::log_fmadd, which computes fmadd(log(x), y, z).
+    // NOTE(review): the removed declaration wrapped the return type in ftype<>; confirm
+    // that dropping it is intended for non-floating argument types.
+    return internal::log_fmadd(x, y, z);
 }
 
 template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INLINE expr_func<fn_log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& m, E3&& a)
+KFR_INTRIN expr_func<internal::fn_log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z)
 {
-    return { fn_log_fmadd(), std::forward<E1>(x), std::forward<E2>(m), std::forward<E3>(a) };
+    // Expression overload (constrained on is_input_expressions<E1, E2, E3>): builds the
+    // expr_func from an empty-brace-initialized internal::fn_log_fmadd and the three
+    // forwarded arguments.
+    return { {}, std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
 }
 
-using fn_pow = internal::in_log_exp<>::fn_pow;
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> pow(const T1& x, const T2& b)
+KFR_INTRIN common_type<T1, T2> pow(const T1& x, const T2& y)
 {
-    return internal::in_log_exp<>::pow(x, b);
+    // Numeric overload (constrained on is_numeric_args<T1, T2>): forwards both arguments
+    // to internal::pow and returns the result as common_type<T1, T2>.
+    // NOTE(review): the removed declaration wrapped the return type in ftype<>; confirm
+    // that dropping it is intended for non-floating argument types.
+    return internal::pow(x, y);
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_pow, E1, E2> pow(E1&& x, E2&& b)
+KFR_INTRIN expr_func<internal::fn_pow, E1, E2> pow(E1&& x, E2&& y)
 {
-    return { fn_pow(), std::forward<E1>(x), std::forward<E2>(b) };
+    // Expression overload (constrained on is_input_expressions<E1, E2>): builds the
+    // expr_func from an empty-brace-initialized internal::fn_pow and both forwarded arguments.
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
-using fn_root = internal::in_log_exp<>::fn_root;
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> root(const T1& x, const T2& b)
+KFR_INTRIN common_type<T1, T2> root(const T1& x, const T2& y)
 {
-    return internal::in_log_exp<>::root(x, b);
+    // Numeric overload (constrained on is_numeric_args<T1, T2>): forwards both arguments
+    // to internal::root and returns the result as common_type<T1, T2>.
+    // NOTE(review): the removed declaration wrapped the return type in ftype<>; confirm
+    // that dropping it is intended for non-floating argument types.
+    return internal::root(x, y);
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_root, E1, E2> root(E1&& x, E2&& b)
+KFR_INTRIN expr_func<internal::fn_root, E1, E2> root(E1&& x, E2&& y)
 {
-    return { fn_root(), std::forward<E1>(x), std::forward<E2>(b) };
+    // Expression overload (constrained on is_input_expressions<E1, E2>): builds the
+    // expr_func from an empty-brace-initialized internal::fn_root and both forwarded arguments.
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
 
-using fn_cbrt = internal::in_log_exp<>::fn_cbrt;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cbrt(const T1& x)
+KFR_INTRIN T1 cbrt(const T1& x)
 {
-    return internal::in_log_exp<>::cbrt(x);
+    // Numeric overload (constrained by KFR_ENABLE_IF on is_numeric<T1>): forwards x to
+    // internal::cbrt and returns the result as T1.
+    // NOTE(review): the removed declaration returned ftype<T1>; confirm that returning
+    // plain T1 is intended for non-floating T1.
+    return internal::cbrt(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cbrt, E1> cbrt(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cbrt, E1> cbrt(E1&& x)
 {
-    return { fn_cbrt(), std::forward<E1>(x) };
-}
-}
+    // Expression overload (constrained on is_input_expression<E1>): builds the expr_func
+    // from an empty-brace-initialized internal::fn_cbrt and the forwarded argument.
+    return { {}, std::forward<E1>(x) };
 }
 
-#pragma clang diagnostic pop
+
+
+
+}
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -28,30 +28,22 @@
 namespace kfr
 {
 
+namespace internal
+{
+
 template <size_t bits>
 struct bitmask
 {
+    // Holds one integer of type findinttype<0, (1ull << bits) - 1> (findinttype is defined
+    // elsewhere; presumably it picks an integer type covering that range - confirm).
+    // NOTE(review): (1ull << bits) is only well defined for bits < 64.
     using type = findinttype<0, (1ull << bits) - 1>;
+
+    // Stores val unchanged.
     bitmask(type val) : value(val) {}
+
+    // Accepts any other type and narrows it to `type` with static_cast.
     template <typename Itype>
     bitmask(Itype val) : value(static_cast<type>(val))
     {
     }
-    type value;
-};
-
-namespace internal
-{
 
-template <cpu_t c = cpu_t::native>
-struct in_bittest : in_bittest<older(c)>
-{
-    struct fn_bittestnone : fn_disabled
-    {
-    };
-    struct fn_bittestall : fn_disabled
-    {
-    };
+    type value;
 };
 
 struct logical_and
@@ -61,6 +53,7 @@ struct logical_and
     {
         return x && y;
     }
+
     template <typename T>
     T operator()(initialvalue<T>)
     {
@@ -68,307 +61,190 @@ struct logical_and
     }
 };
 
-template <>
-struct in_bittest<cpu_t::common>
-{
-    constexpr static cpu_t cpu = cpu_t::common;
-
-    template <typename T, size_t N>
-    KFR_SINTRIN bitmask<N> getmask(vec<T, N> x)
-    {
-        typename bitmask<N>::type val = 0;
-        for (size_t i = 0; i < N; i++)
-        {
-            val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i;
-        }
-        return val;
-    }
+#if defined CID_ARCH_SSE41
+
+KFR_SINTRIN bool bittestnone(f32sse x, f32sse y) { return !_mm_movemask_ps(_mm_and_ps(*x, *y)); } // float forms avoid _mm_test*_ps/_pd: those are AVX-only
+KFR_SINTRIN bool bittestnone(f64sse x, f64sse y) { return !_mm_movemask_pd(_mm_and_pd(*x, *y)); }
+KFR_SINTRIN bool bittestnone(u8sse x, u8sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(u16sse x, u16sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(u32sse x, u32sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(u64sse x, u64sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(i8sse x, i8sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(i16sse x, i16sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(i32sse x, i32sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(i64sse x, i64sse y) { return _mm_testz_si128(*x, *y); }
+KFR_SINTRIN bool bittestnone(f32sse x) { return !_mm_movemask_ps(*x); } // true when no lane has its sign bit set
+KFR_SINTRIN bool bittestnone(f64sse x) { return !_mm_movemask_pd(*x); }
+KFR_SINTRIN bool bittestnone(u8sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(u16sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(u32sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(u64sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(i8sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(i16sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(i32sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestnone(i64sse x) { return _mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestall(f32sse x, f32sse y) { return !_mm_movemask_ps(_mm_andnot_ps(*x, *y)); } // sign bits of (~x & y) must all be clear
+KFR_SINTRIN bool bittestall(f64sse x, f64sse y) { return !_mm_movemask_pd(_mm_andnot_pd(*x, *y)); }
+KFR_SINTRIN bool bittestall(u8sse x, u8sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(u16sse x, u16sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(u32sse x, u32sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(u64sse x, u64sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(i8sse x, i8sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(i16sse x, i16sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(i32sse x, i32sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(i64sse x, i64sse y) { return _mm_testc_si128(*x, *y); }
+KFR_SINTRIN bool bittestall(f32sse x) { return _mm_movemask_ps(*x) == 0xF; } // all four sign bits set
+KFR_SINTRIN bool bittestall(f64sse x) { return _mm_movemask_pd(*x) == 0x3; } // both sign bits set
+KFR_SINTRIN bool bittestall(u8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(u16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(u32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(u64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+
+#if defined CID_ARCH_AVX // bittestnone/bittestall overloads for the AVX float vector types (f32avx, f64avx)
+KFR_SINTRIN bool bittestnone(f32avx x, f32avx y) { return _mm256_testz_ps(*x, *y); }
+KFR_SINTRIN bool bittestnone(f64avx x, f64avx y) { return _mm256_testz_pd(*x, *y); }
+KFR_SINTRIN bool bittestnone(f32avx x) { return _mm256_testz_ps(*x, *x); }
+KFR_SINTRIN bool bittestnone(f64avx x) { return _mm256_testz_pd(*x, *x); }
+KFR_SINTRIN bool bittestall(f32avx x, f32avx y) { return _mm256_testc_ps(*x, *y); }
+KFR_SINTRIN bool bittestall(f64avx x, f64avx y) { return _mm256_testc_pd(*x, *y); }
+KFR_SINTRIN bool bittestall(f32avx x) { return _mm256_testc_ps(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(f64avx x) { return _mm256_testc_pd(*x, *allonesvector(x)); }
+#endif
 
-    template <typename T, size_t N>
-    KFR_SINTRIN bool bittestnone(vec<T, N> x)
-    {
-        return !getmask(x).value;
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN bool bittestnone(vec<T, N> x, vec<T, N> y)
-    {
-        return bittestnone(x & y);
-    }
+#if defined CID_ARCH_AVX2 // bittestnone/bittestall overloads for the AVX integer vector types (u8avx .. i64avx)
+KFR_SINTRIN bool bittestnone(u8avx x, u8avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(u16avx x, u16avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(u32avx x, u32avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(u64avx x, u64avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(i8avx x, i8avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(i16avx x, i16avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(i32avx x, i32avx y) { return _mm256_testz_si256(*x, *y); }
+KFR_SINTRIN bool bittestnone(i64avx x, i64avx y) { return _mm256_testz_si256(*x, *y); }
+
+KFR_SINTRIN bool bittestnone(u8avx x) { return _mm256_testz_si256(*x, *x); } // single-argument form tests x against itself
+KFR_SINTRIN bool bittestnone(u16avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(u32avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(u64avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(i8avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(i16avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(i32avx x) { return _mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestnone(i64avx x) { return _mm256_testz_si256(*x, *x); }
+
+KFR_SINTRIN bool bittestall(u8avx x, u8avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(u16avx x, u16avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(u32avx x, u32avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(u64avx x, u64avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(i8avx x, i8avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(i16avx x, i16avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(i32avx x, i32avx y) { return _mm256_testc_si256(*x, *y); }
+KFR_SINTRIN bool bittestall(i64avx x, i64avx y) { return _mm256_testc_si256(*x, *y); }
+
+KFR_SINTRIN bool bittestall(u8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } // single-argument form tests x against allonesvector(x)
+KFR_SINTRIN bool bittestall(u16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(u32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(u64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(i64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+#endif
 
-    template <typename T, size_t N>
-    KFR_SINTRIN bool bittestall(vec<T, N> x)
-    {
-        return !getmask(~x).value;
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN bool bittestall(vec<T, N> x, vec<T, N> y)
+#elif defined CID_ARCH_SSE2
+
+KFR_SINTRIN bool bittestnone(f32sse x) { return !_mm_movemask_ps(*x); } // SSE2 path: true when the movemask of x is zero
+KFR_SINTRIN bool bittestnone(f64sse x) { return !_mm_movemask_pd(*x); }
+KFR_SINTRIN bool bittestnone(u8sse x) { return !_mm_movemask_epi8(*x); } // NOTE(review): movemask presumably sees only sign bits, while the SSE4.1 integer overloads use _mm_testz_si128; confirm inputs are always masks
+KFR_SINTRIN bool bittestnone(u16sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(u32sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(u64sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(i8sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(i16sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(i32sse x) { return !_mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestnone(i64sse x) { return !_mm_movemask_epi8(*x); }
+
+KFR_SINTRIN bool bittestnone(f32sse x, f32sse y) { return bittestnone(x & y); } // two-argument form reduces to the one-argument test of x & y
+KFR_SINTRIN bool bittestnone(f64sse x, f64sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(u8sse x, u8sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(u16sse x, u16sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(u32sse x, u32sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(u64sse x, u64sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(i8sse x, i8sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(i16sse x, i16sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(i32sse x, i32sse y) { return bittestnone(x & y); }
+KFR_SINTRIN bool bittestnone(i64sse x, i64sse y) { return bittestnone(x & y); }
+
+KFR_SINTRIN bool bittestall(f32sse x) { return !_mm_movemask_ps(*~x); } // true when the movemask of ~x is zero
+KFR_SINTRIN bool bittestall(f64sse x) { return !_mm_movemask_pd(*~x); }
+KFR_SINTRIN bool bittestall(u8sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(u16sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(u32sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(u64sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(i8sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(i16sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(i32sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(i64sse x) { return !_mm_movemask_epi8(*~x); }
+
+KFR_SINTRIN bool bittestall(f32sse x, f32sse y) { return bittestnone(~x & y); } // two-argument form reduces to bittestnone of ~x & y
+KFR_SINTRIN bool bittestall(f64sse x, f64sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(u8sse x, u8sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(u16sse x, u16sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(u32sse x, u32sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(u64sse x, u64sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(i8sse x, i8sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(i16sse x, i16sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(i32sse x, i32sse y) { return bittestnone(~x & y); }
+KFR_SINTRIN bool bittestall(i64sse x, i64sse y) { return bittestnone(~x & y); }
+
+#else
+
+template <typename T, size_t N>
+KFR_SINTRIN bitmask<N> getmask(vec<T, N> x)
+{
+    typename bitmask<N>::type val = 0;
+    for (size_t i = 0; i < N; i++)
     {
-        return bittestnone(~x & y);
+        val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i;
     }
-
-    KFR_SPEC_FN(in_bittest, bittestnone)
-    KFR_SPEC_FN(in_bittest, bittestall)
-};
-
-#ifdef CID_ARCH_X86
-
-template <>
-struct in_bittest<cpu_t::sse2>
-{
-    constexpr static cpu_t cpu = cpu_t::sse2;
-
-    KFR_SINTRIN bitmask<4> getmask(f32sse x) { return bitmask<4>(_mm_movemask_pd(*x)); }
-    KFR_SINTRIN bitmask<4> getmask(f64sse x) { return bitmask<4>(_mm_movemask_pd(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(u8sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(u16sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(u32sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(u64sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(i8sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(i16sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(i32sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<16> getmask(i64sse x) { return bitmask<16>(_mm_movemask_epi8(*x)); }
-
-    KFR_SINTRIN bool bittestnone(f32sse x) { return !_mm_movemask_ps(*x); }
-    KFR_SINTRIN bool bittestnone(f64sse x) { return !_mm_movemask_pd(*x); }
-    KFR_SINTRIN bool bittestnone(u8sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(u16sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(u32sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(u64sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(i8sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(i16sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(i32sse x) { return !_mm_movemask_epi8(*x); }
-    KFR_SINTRIN bool bittestnone(i64sse x) { return !_mm_movemask_epi8(*x); }
-
-    KFR_SINTRIN bool bittestnone(f32sse x, f32sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(f64sse x, f64sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(u8sse x, u8sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(u16sse x, u16sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(u32sse x, u32sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(u64sse x, u64sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(i8sse x, i8sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(i16sse x, i16sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(i32sse x, i32sse y) { return bittestnone(x & y); }
-    KFR_SINTRIN bool bittestnone(i64sse x, i64sse y) { return bittestnone(x & y); }
-
-    KFR_SINTRIN bool bittestall(f32sse x) { return !_mm_movemask_ps(*~x); }
-    KFR_SINTRIN bool bittestall(f64sse x) { return !_mm_movemask_pd(*~x); }
-    KFR_SINTRIN bool bittestall(u8sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(u16sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(u32sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(u64sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(i8sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(i16sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(i32sse x) { return !_mm_movemask_epi8(*~x); }
-    KFR_SINTRIN bool bittestall(i64sse x) { return !_mm_movemask_epi8(*~x); }
-
-    KFR_SINTRIN bool bittestall(f32sse x, f32sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(f64sse x, f64sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(u8sse x, u8sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(u16sse x, u16sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(u32sse x, u32sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(u64sse x, u64sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(i8sse x, i8sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(i16sse x, i16sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(i32sse x, i32sse y) { return bittestnone(~x & y); }
-    KFR_SINTRIN bool bittestall(i64sse x, i64sse y) { return bittestnone(~x & y); }
-
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestnone)
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestall)
-    KFR_SPEC_FN(in_bittest, bittestnone)
-    KFR_SPEC_FN(in_bittest, bittestall)
-};
-
-template <>
-struct in_bittest<cpu_t::sse41> : in_bittest<cpu_t::sse2>
-{
-    constexpr static cpu_t cpu = cpu_t::sse41;
-
-    KFR_SINTRIN bool bittestnone(f32sse x, f32sse y) { return _mm_testz_ps(*x, *y); }
-    KFR_SINTRIN bool bittestnone(f64sse x, f64sse y) { return _mm_testz_pd(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u8sse x, u8sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u16sse x, u16sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u32sse x, u32sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u64sse x, u64sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i8sse x, i8sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i16sse x, i16sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i32sse x, i32sse y) { return _mm_testz_si128(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i64sse x, i64sse y) { return _mm_testz_si128(*x, *y); }
-
-    KFR_SINTRIN bool bittestnone(f32sse x) { return _mm_testz_ps(*x, *x); }
-    KFR_SINTRIN bool bittestnone(f64sse x) { return _mm_testz_pd(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u8sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u16sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u32sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u64sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i8sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i16sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i32sse x) { return _mm_testz_si128(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i64sse x) { return _mm_testz_si128(*x, *x); }
-
-    KFR_SINTRIN bool bittestall(f32sse x, f32sse y) { return _mm_testc_ps(*x, *y); }
-    KFR_SINTRIN bool bittestall(f64sse x, f64sse y) { return _mm_testc_pd(*x, *y); }
-    KFR_SINTRIN bool bittestall(u8sse x, u8sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(u16sse x, u16sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(u32sse x, u32sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(u64sse x, u64sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(i8sse x, i8sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(i16sse x, i16sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(i32sse x, i32sse y) { return _mm_testc_si128(*x, *y); }
-    KFR_SINTRIN bool bittestall(i64sse x, i64sse y) { return _mm_testc_si128(*x, *y); }
-
-    KFR_SINTRIN bool bittestall(f32sse x) { return _mm_testc_ps(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(f64sse x) { return _mm_testc_pd(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestnone)
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestall)
-    KFR_SPEC_FN(in_bittest, bittestnone)
-    KFR_SPEC_FN(in_bittest, bittestall)
-};
-
-template <>
-struct in_bittest<cpu_t::avx1> : in_bittest<cpu_t::sse41>
-{
-    constexpr static cpu_t cpu = cpu_t::avx1;
-    using in_bittest<cpu_t::sse41>::bittestnone;
-    using in_bittest<cpu_t::sse41>::bittestall;
-
-    KFR_SINTRIN bitmask<8> getmask(f32avx x) { return bitmask<8>(_mm256_movemask_pd(*x)); }
-    KFR_SINTRIN bitmask<8> getmask(f64avx x) { return bitmask<8>(_mm256_movemask_pd(*x)); }
-
-    KFR_SINTRIN bool bittestnone(f32avx x, f32avx y) { return _mm256_testz_ps(*x, *y); }
-    KFR_SINTRIN bool bittestnone(f64avx x, f64avx y) { return _mm256_testz_pd(*x, *y); }
-    KFR_SINTRIN bool bittestnone(f32avx x) { return _mm256_testz_ps(*x, *x); }
-    KFR_SINTRIN bool bittestnone(f64avx x) { return _mm256_testz_pd(*x, *x); }
-    KFR_SINTRIN bool bittestnall(f32avx x, f32avx y) { return _mm256_testc_ps(*x, *y); }
-    KFR_SINTRIN bool bittestnall(f64avx x, f64avx y) { return _mm256_testc_pd(*x, *y); }
-    KFR_SINTRIN bool bittestnall(f32avx x) { return _mm256_testc_ps(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestnall(f64avx x) { return _mm256_testc_pd(*x, *allonesvector(x)); }
-
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestnone)
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestall)
-    KFR_SPEC_FN(in_bittest, bittestnone)
-    KFR_SPEC_FN(in_bittest, bittestall)
-};
-
-template <>
-struct in_bittest<cpu_t::avx2> : in_bittest<cpu_t::avx1>
-{
-    constexpr static cpu_t cpu = cpu_t::avx2;
-    using in_bittest<cpu_t::avx1>::bittestnone;
-    using in_bittest<cpu_t::avx1>::bittestall;
-
-    KFR_SINTRIN bitmask<32> getmask(u8avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(u16avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(u32avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(u64avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(i8avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(i16avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(i32avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-    KFR_SINTRIN bitmask<32> getmask(i64avx x) { return bitmask<32>(_mm256_movemask_epi8(*x)); }
-
-    KFR_SINTRIN bool bittestnone(u8avx x, u8avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u16avx x, u16avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u32avx x, u32avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(u64avx x, u64avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i8avx x, i8avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i16avx x, i16avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i32avx x, i32avx y) { return _mm256_testz_si256(*x, *y); }
-    KFR_SINTRIN bool bittestnone(i64avx x, i64avx y) { return _mm256_testz_si256(*x, *y); }
-
-    KFR_SINTRIN bool bittestnone(u8avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u16avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u32avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(u64avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i8avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i16avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i32avx x) { return _mm256_testz_si256(*x, *x); }
-    KFR_SINTRIN bool bittestnone(i64avx x) { return _mm256_testz_si256(*x, *x); }
-
-    KFR_SINTRIN bool bittestall(u8avx x, u8avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(u16avx x, u16avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(u32avx x, u32avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(u64avx x, u64avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(i8avx x, i8avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(i16avx x, i16avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(i32avx x, i32avx y) { return _mm256_testc_si256(*x, *y); }
-    KFR_SINTRIN bool bittestall(i64avx x, i64avx y) { return _mm256_testc_si256(*x, *y); }
-
-    KFR_SINTRIN bool bittestall(u8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(u64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-    KFR_SINTRIN bool bittestall(i64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestnone)
-    KFR_HANDLE_ALL_REDUCE(logical_and, bittestall)
-    KFR_SPEC_FN(in_bittest, bittestnone)
-    KFR_SPEC_FN(in_bittest, bittestall)
-};
-#endif
+    return val;
 }
 
-namespace native
-{
-using fn_bittestnone = internal::in_bittest<>::fn_bittestnone;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> bittestnone(const T1& x)
+template <typename T, size_t N>
+KFR_SINTRIN bool bittestnone(vec<T, N> x)
 {
-    return internal::in_bittest<>::bittestnone(x);
+    return !getmask(x).value;
 }
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_bittestnone, E1> bittestnone(E1&& x)
+template <typename T, size_t N>
+KFR_SINTRIN bool bittestnone(vec<T, N> x, vec<T, N> y)
 {
-    return { fn_bittestnone(), std::forward<E1>(x) };
+    return bittestnone(x & y);
 }
 
-using fn_bittestall = internal::in_bittest<>::fn_bittestall;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> bittestall(const T1& x)
+template <typename T, size_t N>
+KFR_SINTRIN bool bittestall(vec<T, N> x)
 {
-    return internal::in_bittest<>::bittestall(x);
+    return !getmask(~x).value;
 }
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_bittestall, E1> bittestall(E1&& x)
+template <typename T, size_t N>
+KFR_SINTRIN bool bittestall(vec<T, N> x, vec<T, N> y)
 {
-    return { fn_bittestall(), std::forward<E1>(x) };
+    return bittestnone(~x & y);
 }
-
-using fn_bittestnone = internal::in_bittest<>::fn_bittestnone;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> bittestnone(const T1& x, const T2& y)
-{
-    return internal::in_bittest<>::bittestnone(x, y);
+#endif
 }
 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_bittestnone, E1, E2> bittestnone(E1&& x, E2&& y)
-{
-    return { fn_bittestnone(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-using fn_bittestall = internal::in_bittest<>::fn_bittestall;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> bittestall(const T1& x, const T2& y)
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRIN T1 bittestnone(const T1& x) // forwards to internal::bittestnone; NOTE(review): every internal overload shown returns bool, yet this returns T1 - confirm intended
 {
-    return internal::in_bittest<>::bittestnone(x);
+    return internal::bittestnone(x);
 }
 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_bittestall, E1, E2> bittestall(E1&& x, E2&& y)
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRIN T1 bittestall(const T1& x) // forwards to internal::bittestall; NOTE(review): every internal overload shown returns bool, yet this returns T1 - confirm intended
 {
-    return { fn_bittestall(), std::forward<E1>(x), std::forward<E2>(y) };
-}
+    return internal::bittestall(x);
 }
 }
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -27,383 +27,173 @@
 #include "operators.hpp"
 #include "select.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
 
-template <cpu_t cpu = cpu_t::native, cpu_t cc = cpu>
-struct in_min_max : in_min_max<older(cpu), cc>
-{
-    struct fn_min : in_min_max<older(cpu), cc>::fn_min, fn_disabled
-    {
-    };
-    struct fn_max : in_min_max<older(cpu), cc>::fn_max, fn_disabled
-    {
-    };
-};
-
-template <cpu_t cc>
-struct in_min_max<cpu_t::common, cc> : in_select<cc>
-{
-    constexpr static cpu_t cpu = cpu_t::common;
-
-private:
-    using in_select<cc>::select;
-
-public:
-    template <typename T>
-    KFR_SINTRIN T min(initialvalue<T>)
-    {
-        return std::numeric_limits<T>::max();
-    }
-    template <typename T>
-    KFR_SINTRIN T max(initialvalue<T>)
-    {
-        return std::numeric_limits<T>::min();
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> min(vec<T, N> x, vec<T, N> y)
-    {
-        return select(x < y, x, y);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> max(vec<T, N> x, vec<T, N> y)
-    {
-        return select(x > y, x, y);
-    }
-
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_min_max, min)
-    KFR_SPEC_FN(in_min_max, max)
-};
-
-#ifdef CID_ARCH_X86
-
-template <cpu_t cc>
-struct in_min_max<cpu_t::sse2, cc> : in_select<cc>
-{
-    constexpr static cpu_t cpu = cpu_t::sse2;
-
-private:
-    using in_select<cc>::select;
-
-public:
-    template <typename T>
-    KFR_SINTRIN T min(initialvalue<T>)
-    {
-        return std::numeric_limits<T>::max();
-    }
-    template <typename T>
-    KFR_SINTRIN T max(initialvalue<T>)
-    {
-        return std::numeric_limits<T>::min();
-    }
-
-    KFR_CPU_INTRIN(sse2) f32sse min(f32sse x, f32sse y) { return _mm_min_ps(*x, *y); }
-    KFR_CPU_INTRIN(sse2) f64sse min(f64sse x, f64sse y) { return _mm_min_pd(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i8sse min(i8sse x, i8sse y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(sse2) u16sse min(u16sse x, u16sse y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(sse2) i32sse min(i32sse x, i32sse y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(sse2) u32sse min(u32sse x, u32sse y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(sse2) u8sse min(u8sse x, u8sse y) { return _mm_min_epu8(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i16sse min(i16sse x, i16sse y) { return _mm_min_epi16(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i64sse min(i64sse x, i64sse y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(sse2) u64sse min(u64sse x, u64sse y) { return select(x < y, x, y); }
-
-    KFR_CPU_INTRIN(sse2) f32sse max(f32sse x, f32sse y) { return _mm_max_ps(*x, *y); }
-    KFR_CPU_INTRIN(sse2) f64sse max(f64sse x, f64sse y) { return _mm_max_pd(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i8sse max(i8sse x, i8sse y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(sse2) u16sse max(u16sse x, u16sse y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(sse2) i32sse max(i32sse x, i32sse y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(sse2) u32sse max(u32sse x, u32sse y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(sse2) u8sse max(u8sse x, u8sse y) { return _mm_max_epu8(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i16sse max(i16sse x, i16sse y) { return _mm_max_epi16(*x, *y); }
-    KFR_CPU_INTRIN(sse2) i64sse max(i64sse x, i64sse y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(sse2) u64sse max(u64sse x, u64sse y) { return select(x > y, x, y); }
-
-    KFR_HANDLE_ALL(min)
-    KFR_HANDLE_ALL(max)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_min_max, min)
-    KFR_SPEC_FN(in_min_max, max)
-};
-
-template <cpu_t cc>
-struct in_min_max<cpu_t::sse41, cc> : in_min_max<cpu_t::sse2>
-{
-    constexpr static cpu_t cpu = cpu_t::sse41;
-    using in_min_max<cpu_t::sse2>::min;
-    using in_min_max<cpu_t::sse2>::max;
-
-    KFR_CPU_INTRIN(sse41) i8sse min(i8sse x, i8sse y) { return _mm_min_epi8(*x, *y); }
-    KFR_CPU_INTRIN(sse41) u16sse min(u16sse x, u16sse y) { return _mm_min_epu16(*x, *y); }
-    KFR_CPU_INTRIN(sse41) i32sse min(i32sse x, i32sse y) { return _mm_min_epi32(*x, *y); }
-    KFR_CPU_INTRIN(sse41) u32sse min(u32sse x, u32sse y) { return _mm_min_epu32(*x, *y); }
-
-    KFR_CPU_INTRIN(sse41) i8sse max(i8sse x, i8sse y) { return _mm_max_epi8(*x, *y); }
-    KFR_CPU_INTRIN(sse41) u16sse max(u16sse x, u16sse y) { return _mm_max_epu16(*x, *y); }
-    KFR_CPU_INTRIN(sse41) i32sse max(i32sse x, i32sse y) { return _mm_max_epi32(*x, *y); }
-    KFR_CPU_INTRIN(sse41) u32sse max(u32sse x, u32sse y) { return _mm_max_epu32(*x, *y); }
-
-    KFR_HANDLE_ALL(min)
-    KFR_HANDLE_ALL(max)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_min_max, min)
-    KFR_SPEC_FN(in_min_max, max)
-};
-
-template <cpu_t cc>
-struct in_min_max<cpu_t::avx1, cc> : in_min_max<cpu_t::sse41>
-{
-    constexpr static cpu_t cpu = cpu_t::avx1;
-    using in_min_max<cpu_t::sse41>::min;
-    using in_min_max<cpu_t::sse41>::max;
-
-    KFR_CPU_INTRIN(avx) f32avx min(f32avx x, f32avx y) { return _mm256_min_ps(*x, *y); }
-    KFR_CPU_INTRIN(avx) f64avx min(f64avx x, f64avx y) { return _mm256_min_pd(*x, *y); }
-    KFR_CPU_INTRIN(avx) f32avx max(f32avx x, f32avx y) { return _mm256_max_ps(*x, *y); }
-    KFR_CPU_INTRIN(avx) f64avx max(f64avx x, f64avx y) { return _mm256_max_pd(*x, *y); }
-
-    KFR_HANDLE_ALL(min)
-    KFR_HANDLE_ALL(max)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_min_max, min)
-    KFR_SPEC_FN(in_min_max, max)
-};
-
-template <cpu_t cc>
-struct in_min_max<cpu_t::avx2, cc> : in_min_max<cpu_t::avx1>, in_select<cpu_t::avx2>
-{
-    constexpr static cpu_t cpu = cpu_t::avx2;
-
-private:
-    using in_select<cpu>::select;
-
-public:
-    using in_min_max<cpu_t::avx1>::min;
-    using in_min_max<cpu_t::avx1>::max;
-
-    KFR_CPU_INTRIN(avx2) u8avx min(u8avx x, u8avx y) { return _mm256_min_epu8(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i16avx min(i16avx x, i16avx y) { return _mm256_min_epi16(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i8avx min(i8avx x, i8avx y) { return _mm256_min_epi8(*x, *y); }
-    KFR_CPU_INTRIN(avx2) u16avx min(u16avx x, u16avx y) { return _mm256_min_epu16(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i32avx min(i32avx x, i32avx y) { return _mm256_min_epi32(*x, *y); }
-    KFR_CPU_INTRIN(avx2) u32avx min(u32avx x, u32avx y) { return _mm256_min_epu32(*x, *y); }
-
-    KFR_CPU_INTRIN(avx2) u8avx max(u8avx x, u8avx y) { return _mm256_max_epu8(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i16avx max(i16avx x, i16avx y) { return _mm256_max_epi16(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i8avx max(i8avx x, i8avx y) { return _mm256_max_epi8(*x, *y); }
-    KFR_CPU_INTRIN(avx2) u16avx max(u16avx x, u16avx y) { return _mm256_max_epu16(*x, *y); }
-    KFR_CPU_INTRIN(avx2) i32avx max(i32avx x, i32avx y) { return _mm256_max_epi32(*x, *y); }
-    KFR_CPU_INTRIN(avx2) u32avx max(u32avx x, u32avx y) { return _mm256_max_epu32(*x, *y); }
+#if defined CID_ARCH_SSE2
+// SSE2 min: f32/f64/u8/i16 use the matching _mm_min_* intrinsic, 64-bit integers use compare + select.
+KFR_SINTRIN f32sse min(f32sse x, f32sse y) { return _mm_min_ps(*x, *y); }
+KFR_SINTRIN f64sse min(f64sse x, f64sse y) { return _mm_min_pd(*x, *y); }
+KFR_SINTRIN u8sse min(u8sse x, u8sse y) { return _mm_min_epu8(*x, *y); }
+KFR_SINTRIN i16sse min(i16sse x, i16sse y) { return _mm_min_epi16(*x, *y); }
+KFR_SINTRIN i64sse min(i64sse x, i64sse y) { return select(x < y, x, y); }
+KFR_SINTRIN u64sse min(u64sse x, u64sse y) { return select(x < y, x, y); }
+// SSE2 max: same split between _mm_max_* intrinsics and compare + select as for min.
+KFR_SINTRIN f32sse max(f32sse x, f32sse y) { return _mm_max_ps(*x, *y); }
+KFR_SINTRIN f64sse max(f64sse x, f64sse y) { return _mm_max_pd(*x, *y); }
+KFR_SINTRIN u8sse max(u8sse x, u8sse y) { return _mm_max_epu8(*x, *y); }
+KFR_SINTRIN i16sse max(i16sse x, i16sse y) { return _mm_max_epi16(*x, *y); }
+KFR_SINTRIN i64sse max(i64sse x, i64sse y) { return select(x > y, x, y); }
+KFR_SINTRIN u64sse max(u64sse x, u64sse y) { return select(x > y, x, y); }
+
+#if defined CID_ARCH_AVX2 // integer min/max overloads for the *avx vector types
+KFR_SINTRIN u8avx min(u8avx x, u8avx y) { return _mm256_min_epu8(*x, *y); }
+KFR_SINTRIN i16avx min(i16avx x, i16avx y) { return _mm256_min_epi16(*x, *y); }
+KFR_SINTRIN i8avx min(i8avx x, i8avx y) { return _mm256_min_epi8(*x, *y); }
+KFR_SINTRIN u16avx min(u16avx x, u16avx y) { return _mm256_min_epu16(*x, *y); }
+KFR_SINTRIN i32avx min(i32avx x, i32avx y) { return _mm256_min_epi32(*x, *y); }
+KFR_SINTRIN u32avx min(u32avx x, u32avx y) { return _mm256_min_epu32(*x, *y); }
+
+KFR_SINTRIN u8avx max(u8avx x, u8avx y) { return _mm256_max_epu8(*x, *y); }
+KFR_SINTRIN i16avx max(i16avx x, i16avx y) { return _mm256_max_epi16(*x, *y); }
+KFR_SINTRIN i8avx max(i8avx x, i8avx y) { return _mm256_max_epi8(*x, *y); }
+KFR_SINTRIN u16avx max(u16avx x, u16avx y) { return _mm256_max_epu16(*x, *y); }
+KFR_SINTRIN i32avx max(i32avx x, i32avx y) { return _mm256_max_epi32(*x, *y); }
+KFR_SINTRIN u32avx max(u32avx x, u32avx y) { return _mm256_max_epu32(*x, *y); }
+// i64avx/u64avx: implemented with compare + select rather than a dedicated min/max intrinsic.
+KFR_SINTRIN i64avx min(i64avx x, i64avx y) { return select(x < y, x, y); }
+KFR_SINTRIN u64avx min(u64avx x, u64avx y) { return select(x < y, x, y); }
+KFR_SINTRIN i64avx max(i64avx x, i64avx y) { return select(x > y, x, y); }
+KFR_SINTRIN u64avx max(u64avx x, u64avx y) { return select(x > y, x, y); }
+#endif // CID_ARCH_AVX2
 
-    KFR_CPU_INTRIN(avx2) i64avx min(i64avx x, i64avx y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(avx2) u64avx min(u64avx x, u64avx y) { return select(x < y, x, y); }
-    KFR_CPU_INTRIN(avx2) i64avx max(i64avx x, i64avx y) { return select(x > y, x, y); }
-    KFR_CPU_INTRIN(avx2) u64avx max(u64avx x, u64avx y) { return select(x > y, x, y); }
+#if defined CID_ARCH_AVX // f32avx/f64avx overloads forward to the _mm256_min_*/_mm256_max_* intrinsics
+KFR_SINTRIN f32avx min(f32avx x, f32avx y) { return _mm256_min_ps(*x, *y); }
+KFR_SINTRIN f64avx min(f64avx x, f64avx y) { return _mm256_min_pd(*x, *y); }
+KFR_SINTRIN f32avx max(f32avx x, f32avx y) { return _mm256_max_ps(*x, *y); }
+KFR_SINTRIN f64avx max(f64avx x, f64avx y) { return _mm256_max_pd(*x, *y); }
+#endif // CID_ARCH_AVX
 
-    KFR_HANDLE_ALL(min)
-    KFR_HANDLE_ALL(max)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_min_max, min)
-    KFR_SPEC_FN(in_min_max, max)
-};
+#if defined CID_ARCH_SSE41 // i8/u16/i32/u32 overloads backed by _mm_min_*/_mm_max_* intrinsics
+KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return _mm_min_epi8(*x, *y); }
+KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return _mm_min_epu16(*x, *y); }
+KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return _mm_min_epi32(*x, *y); }
+KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return _mm_min_epu32(*x, *y); }
+
+KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return _mm_max_epi8(*x, *y); }
+KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return _mm_max_epu16(*x, *y); }
+KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return _mm_max_epi32(*x, *y); }
+KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return _mm_max_epu32(*x, *y); }
+#else // CID_ARCH_SSE41 not defined: the same overloads built from compare + select
+KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return select(x < y, x, y); }
+KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return select(x < y, x, y); }
+KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return select(x < y, x, y); }
+KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return select(x < y, x, y); }
+
+KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return select(x > y, x, y); }
+KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return select(x > y, x, y); }
+KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return select(x > y, x, y); }
+KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return select(x > y, x, y); }
 
 #endif
 
-template <cpu_t cpu = cpu_t::native>
-struct in_minabs_maxabs
-{
-public:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> minabs(vec<T, N> x, vec<T, N> y)
-    {
-        return in_min_max<cpu>::min(in_abs<cpu>::abs(x), in_abs<cpu>::abs(y));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> maxabs(vec<T, N> x, vec<T, N> y)
-    {
-        return in_min_max<cpu>::max(in_abs<cpu>::abs(x), in_abs<cpu>::abs(y));
-    }
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
 
-    KFR_HANDLE_ALL(minabs)
-    KFR_HANDLE_ALL(maxabs)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_minabs_maxabs, minabs)
-    KFR_SPEC_FN(in_minabs_maxabs, maxabs)
-};
+#else
 
-template <cpu_t cpu = cpu_t::native>
-struct in_clamp : in_min_max<cpu>
+// fallback
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> min(vec<T, N> x, vec<T, N> y)
 {
-    using in_min_max<cpu>::min;
-    using in_min_max<cpu>::max;
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, T minimum, T maximum)
-    {
-        return clamp(x, broadcast<N>(minimum), broadcast<N>(maximum));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, T minimum, vec<T, N> maximum)
-    {
-        return clamp(x, broadcast<N>(minimum), maximum);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> minimum, T maximum)
-    {
-        return clamp(x, minimum, broadcast<N>(maximum));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, T maximum)
-    {
-        return clamp(x, broadcast<N>(maximum));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> minimum, vec<T, N> maximum)
-    {
-        return max(minimum, min(x, maximum));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> maximum)
-    {
-        return max(zerovector<T, N>(), min(x, maximum));
-    }
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clampm1(vec<T, N> x, vec<T, N> minimum, vec<T, N> maximum)
-    {
-        return max(minimum, min(x, maximum - T(1)));
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> clampm1(vec<T, N> x, vec<T, N> maximum)
-    {
-        return max(zerovector<T, N>(), min(x, maximum - T(1)));
-    }
-    KFR_HANDLE_ALL(clamp)
-    KFR_HANDLE_ALL(clampm1)
-    KFR_HANDLE_SCALAR(min)
-    KFR_HANDLE_SCALAR(max)
-    KFR_SPEC_FN(in_clamp, clamp)
-    KFR_SPEC_FN(in_clamp, clampm1)
-};
+    return select(x < y, x, y);
 }
-
-namespace native
-{
-using fn_min = internal::in_min_max<>::fn_min;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> min(const T1& x, const T2& y)
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> max(vec<T, N> x, vec<T, N> y)
 {
-    return internal::in_min_max<>::min(x, y);
+    return select(x > y, x, y);
 }
+#endif
 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_min, E1, E2> min(E1&& x, E2&& y)
+template <typename T>
+KFR_SINTRIN T min(initialvalue<T>)
 {
-    return { fn_min(), std::forward<E1>(x), std::forward<E2>(y) };
+    return std::numeric_limits<T>::max();
 }
-using fn_max = internal::in_min_max<>::fn_max;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> max(const T1& x, const T2& y)
+template <typename T> // NOTE(review): presumably the seed of a max reduction - confirm against users of initialvalue<T>
+KFR_SINTRIN T max(initialvalue<T>)
 {
-    return internal::in_min_max<>::max(x, y);
+    return std::numeric_limits<T>::lowest(); // min() is the smallest positive value for floating-point T; lowest() is the true minimum
 }
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_max, E1, E2> max(E1&& x, E2&& y)
+template <typename T>
+KFR_SINTRIN T absmin(initialvalue<T>)
 {
-    return { fn_max(), std::forward<E1>(x), std::forward<E2>(y) };
+    return std::numeric_limits<T>::max();
 }
-using fn_minabs = internal::in_minabs_maxabs<>::fn_minabs;
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> minabs(const T1& x, const T2& y)
+template <typename T>
+KFR_SINTRIN T absmax(initialvalue<T>)
 {
-    return internal::in_minabs_maxabs<>::minabs(x, y);
+    return 0;
 }
 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_minabs, E1, E2> minabs(E1&& x, E2&& y)
-{
-    return { fn_minabs(), std::forward<E1>(x), std::forward<E2>(y) };
+KFR_HANDLE_SCALAR_2(min)
+KFR_FN(min)
+KFR_HANDLE_SCALAR_2(max)
+KFR_FN(max)
+KFR_HANDLE_SCALAR_2(absmin)
+KFR_FN(absmin)
+KFR_HANDLE_SCALAR_2(absmax)
+KFR_FN(absmax)
 }
-using fn_maxabs = internal::in_minabs_maxabs<>::fn_maxabs;
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> maxabs(const T1& x, const T2& y)
+KFR_INTRIN common_type<T1, T2> min(const T1& x, const T2& y)
 {
-    return internal::in_minabs_maxabs<>::maxabs(x, y);
+    return internal::min(x, y);
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_maxabs, E1, E2> maxabs(E1&& x, E2&& y)
-{
-    return { fn_maxabs(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-using fn_clamp = internal::in_clamp<>::fn_clamp;
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INLINE ftype<common_type<T1, T2, T3>> clamp(const T1& x, const T2& l, const T3& h)
+KFR_INTRIN expr_func<internal::fn_min, E1, E2> min(E1&& x, E2&& y)
 {
-    return internal::in_clamp<>::clamp(x, l, h);
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
 
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INLINE expr_func<fn_clamp, E1, E2, E3> clamp(E1&& x, E2&& l, E3&& h)
-{
-    return { fn_clamp(), std::forward<E1>(x), std::forward<E2>(l), std::forward<E3>(h) };
-}
-using fn_clampm1 = internal::in_clamp<>::fn_clampm1;
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INLINE ftype<common_type<T1, T2, T3>> clampm1(const T1& x, const T2& l, const T3& h)
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_INTRIN common_type<T1, T2> max(const T1& x, const T2& y)
 {
-    return internal::in_clamp<>::clampm1(x, l, h);
+    return internal::max(x, y);
 }
 
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INLINE expr_func<fn_clampm1, E1, E2, E3> clampm1(E1&& x, E2&& l, E3&& h)
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRIN expr_func<internal::fn_max, E1, E2> max(E1&& x, E2&& y)
 {
-    return { fn_clampm1(), std::forward<E1>(x), std::forward<E2>(l), std::forward<E3>(h) };
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
 
-using fn_clamp = internal::in_clamp<>::fn_clamp;
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> clamp(const T1& x, const T2& h)
+KFR_INTRIN common_type<T1, T2> absmin(const T1& x, const T2& y)
 {
-    return internal::in_clamp<>::clamp(x, h);
+    return internal::absmin(x, y);
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_clamp, E1, E2> clamp(E1&& x, E2&& h)
+KFR_INTRIN expr_func<internal::fn_absmin, E1, E2> absmin(E1&& x, E2&& y)
 {
-    return { fn_clamp(), std::forward<E1>(x), std::forward<E2>(h) };
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
-using fn_clampm1 = internal::in_clamp<>::fn_clampm1;
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> clampm1(const T1& x, const T2& h)
+KFR_INTRIN common_type<T1, T2> absmax(const T1& x, const T2& y)
 {
-    return internal::in_clamp<>::clampm1(x, h);
+    return internal::absmax(x, y);
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_clampm1, E1, E2> clampm1(E1&& x, E2&& h)
+KFR_INTRIN expr_func<internal::fn_absmax, E1, E2> absmax(E1&& x, E2&& y)
 {
-    return { fn_clampm1(), std::forward<E1>(x), std::forward<E2>(h) };
-}
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) };
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp
@@ -0,0 +1,113 @@
+/**
+ * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
+ * This file is part of KFR
+ *
+ * KFR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KFR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with KFR.
+ *
+ * If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ * Buying a commercial license is mandatory as soon as you develop commercial activities without
+ * disclosing the source code of your own applications.
+ * See http://www.kfrlib.com for details.
+ */
+#pragma once
+#include "function.hpp"
+#include "log_exp.hpp"
+
+#pragma clang diagnostic push
+#if CID_HAS_WARNING("-Wc99-extensions")
+#pragma clang diagnostic ignored "-Wc99-extensions"
+#endif
+
+namespace kfr
+{
+
+namespace internal
+{
+// bessel_coef<T>[i] == 1 / ((i + 2)!)^2 for i = 0..38 (39 entries); read by modzerobessel in this file.
+template <typename T>
+constexpr T bessel_coef[] = { T(0.25),
+                              T(0.027777777777777776236),
+                              T(0.0017361111111111110147),
+                              T(6.9444444444444444384e-005),
+                              T(1.9290123456790123911e-006),
+                              T(3.9367598891408417495e-008),
+                              T(6.1511873267825652335e-010),
+                              T(7.5940584281266239246e-012),
+                              T(7.5940584281266233693e-014),
+                              T(6.2760813455591932909e-016),
+                              T(4.3583898233049949985e-018),
+                              T(2.5789288895295827557e-020),
+                              T(1.3157800456783586208e-022),
+                              T(5.8479113141260384983e-025),
+                              T(2.2843403570804837884e-027),
+                              T(7.904291893012054025e-030),
+                              T(2.4395962632753252792e-032),
+                              T(6.75788438580422547e-035),
+                              T(1.689471096451056426e-037),
+                              T(3.8310002187098784929e-040),
+                              T(7.9152897080782616517e-043),
+                              T(1.4962740468957016443e-045),
+                              T(2.5976979980828152196e-048),
+                              T(4.1563167969325041577e-051),
+                              T(6.1483976285983795968e-054),
+                              T(8.434015951438105991e-057),
+                              T(1.0757673407446563809e-059),
+                              T(1.2791526049282476926e-062),
+                              T(1.4212806721424974034e-065),
+                              T(1.4789601166935457918e-068),
+                              T(1.4442969889585408123e-071),
+                              T(1.3262598613026086927e-074),
+                              T(1.1472836170437790782e-077),
+                              T(9.3655805472961564331e-081),
+                              T(7.2265282000741942594e-084),
+                              T(5.2786911614858977913e-087),
+                              T(3.6556032974279072401e-090),
+                              T(2.4034209713529963119e-093),
+                              T(1.5021381070956226783e-096) };
+// Evaluates 1 + t + sum(bessel_coef<T>[k] * t^(k + 2)) with t = (x / 2)^2: the truncated power series of I0(x).
+template <typename T, size_t N>
+KFR_INLINE vec<T, N> modzerobessel(vec<T, N> x)
+{
+    constexpr size_t term_count = sizeof(T) == 4 ? 20 : 39; // 20 table entries for 4-byte T, all 39 otherwise
+    const vec<T, N> half        = x * 0.5;
+    const vec<T, N> t           = half * half;
+    vec<T, N> power             = t;
+    vec<T, N> sum               = 1 + t; // first two terms of the series
+    KFR_LOOP_UNROLL
+    for (size_t k = 0; k < term_count; ++k)
+    {
+        power *= t; // power now holds t^(k + 2)
+        sum = fmadd(power, bessel_coef<T>[k], sum);
+    }
+    return sum;
+}
+// NOTE(review): presumably these generate the scalar overload and fn_modzerobessel (used further down) - confirm in the macro definitions.
+KFR_HANDLE_SCALAR(modzerobessel)
+KFR_FN(modzerobessel)
+}
+// Public overload enabled when is_numeric<T1>::value holds: forwards x to internal::modzerobessel and returns T1.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRIN T1 modzerobessel(const T1& x)
+{
+    return internal::modzerobessel(x);
+}
+// Overload for input expressions: brace-initializes an expr_func<internal::fn_modzerobessel, E1> from {} and the forwarded x.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRIN expr_func<internal::fn_modzerobessel, E1> modzerobessel(E1&& x)
+{
+    return { {}, std::forward<E1>(x) };
+}
+}
+
+#pragma clang diagnostic pop
diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp
@@ -660,4 +660,40 @@ KFR_INLINE vec<T, N> negodd(const vec<T, N>& x)
 {
     return x ^ broadcast<N>(T(), -T());
 }
+
+// KFR_EXPR_UNARY(fn, op): defines unary `operator op` for input expressions, returning bind_expression(fn(), a1).
+#define KFR_EXPR_UNARY(fn, op)                                                                               \
+    template <typename A1, KFR_ENABLE_IF(is_input_expression<A1>::value)>                                    \
+    KFR_INLINE auto operator op(A1&& a1)->decltype(bind_expression(fn(), std::forward<A1>(a1)))              \
+    {                                                                                                        \
+        return bind_expression(fn(), std::forward<A1>(a1));                                                  \
+    }
+// KFR_EXPR_BINARY(fn, op): defines binary `operator op` for input expressions, returning bind_expression(fn(), a1, a2).
+#define KFR_EXPR_BINARY(fn, op)                                                                              \
+    template <typename A1, typename A2, KFR_ENABLE_IF(is_input_expressions<A1, A2>::value)>                  \
+    KFR_INLINE auto operator op(A1&& a1, A2&& a2)                                                            \
+        ->decltype(bind_expression(fn(), std::forward<A1>(a1), std::forward<A2>(a2)))                        \
+    {                                                                                                        \
+        return bind_expression(fn(), std::forward<A1>(a1), std::forward<A2>(a2));                            \
+    }
+// Unary expression operators: negation and bitwise NOT.
+KFR_EXPR_UNARY(fn_neg, -)
+KFR_EXPR_UNARY(fn_bitwisenot, ~)
+// Binary expression operators: arithmetic, bitwise and shifts.
+KFR_EXPR_BINARY(fn_add, +)
+KFR_EXPR_BINARY(fn_sub, -)
+KFR_EXPR_BINARY(fn_mul, *)
+KFR_EXPR_BINARY(fn_div, /)
+KFR_EXPR_BINARY(fn_bitwiseand, &)
+KFR_EXPR_BINARY(fn_bitwiseor, |)
+KFR_EXPR_BINARY(fn_bitwisexor, ^)
+KFR_EXPR_BINARY(fn_shl, <<)
+KFR_EXPR_BINARY(fn_shr, >>)
+// Binary expression operators: comparisons.
+KFR_EXPR_BINARY(fn_equal, ==)
+KFR_EXPR_BINARY(fn_notequal, !=)
+KFR_EXPR_BINARY(fn_less, <)
+KFR_EXPR_BINARY(fn_greater, >)
+KFR_EXPR_BINARY(fn_lessorequal, <=)
+KFR_EXPR_BINARY(fn_greaterorequal, >=)
 }
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -28,6 +28,9 @@
 namespace kfr
 {
 
+namespace internal
+{
+
 #define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TRUNC)
 #define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_NINT)
 #define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TRUNC)
@@ -48,285 +51,211 @@ namespace kfr
 #define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TRUNC)
 #define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_NINT)
 
-namespace internal
-{
+#if defined CID_ARCH_SSE41
 
-template <cpu_t c = cpu_t::native>
-struct in_round : in_round<older(c)>
-{
-    struct fn_floor : in_round<older(c)>::fn_floor, fn_disabled
-    {
-    };
-    struct fn_ceil : in_round<older(c)>::fn_ceil, fn_disabled
-    {
-    };
-    struct fn_round : in_round<older(c)>::fn_round, fn_disabled
-    {
-    };
-    struct fn_trunc : in_round<older(c)>::fn_trunc, fn_disabled
-    {
-    };
-    struct fn_fract : in_round<older(c)>::fn_fract, fn_disabled
-    {
-    };
-};
+KFR_SINTRIN f32sse floor(f32sse value) { return _mm_floor_ps(*value); } // SSE4.1 overloads (inside #if defined CID_ARCH_SSE41): one rounding intrinsic per call
+KFR_SINTRIN f32sse ceil(f32sse value) { return _mm_ceil_ps(*value); }
+KFR_SINTRIN f32sse trunc(f32sse value) { return KFR_mm_trunc_ps(*value); } // KFR_mm_trunc_ps(V) is _mm_round_ps(V, _MM_FROUND_TRUNC)
+KFR_SINTRIN f32sse round(f32sse value) { return KFR_mm_roundnearest_ps(*value); } // KFR_mm_roundnearest_ps(V) is _mm_round_ps(V, _MM_FROUND_NINT)
+KFR_SINTRIN f64sse floor(f64sse value) { return _mm_floor_pd(*value); } // double-precision counterparts of the four overloads above
+KFR_SINTRIN f64sse ceil(f64sse value) { return _mm_ceil_pd(*value); }
+KFR_SINTRIN f64sse trunc(f64sse value) { return KFR_mm_trunc_pd(*value); }
+KFR_SINTRIN f64sse round(f64sse value) { return KFR_mm_roundnearest_pd(*value); }
+KFR_SINTRIN f32sse fract(f32sse x) { return x - floor(x); } // fractional part, measured from floor(x)
+KFR_SINTRIN f64sse fract(f64sse x) { return x - floor(x); }
 
-template <>
-struct in_round<cpu_t::common>
-{
-    constexpr static cpu_t cpu = cpu_t::common;
+#if defined CID_ARCH_AVX
 
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> floor(vec<T, N> value)
-    {
-        return value;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> ceil(vec<T, N> value)
-    {
-        return value;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> trunc(vec<T, N> value)
-    {
-        return value;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> round(vec<T, N> value)
-    {
-        return value;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> fract(vec<T, N>)
-    {
-        return T();
-    }
+KFR_SINTRIN f32avx floor(f32avx value) { return _mm256_floor_ps(*value); } // AVX overloads (inside #if defined CID_ARCH_AVX): 256-bit variants of the SSE4.1 set
+KFR_SINTRIN f32avx ceil(f32avx value) { return _mm256_ceil_ps(*value); }
+KFR_SINTRIN f32avx trunc(f32avx value) { return KFR_mm256_trunc_ps(*value); }
+KFR_SINTRIN f32avx round(f32avx value) { return KFR_mm256_roundnearest_ps(*value); }
+KFR_SINTRIN f64avx floor(f64avx value) { return _mm256_floor_pd(*value); }
+KFR_SINTRIN f64avx ceil(f64avx value) { return _mm256_ceil_pd(*value); }
+KFR_SINTRIN f64avx trunc(f64avx value) { return KFR_mm256_trunc_pd(*value); } // KFR_mm256_trunc_pd(V) is _mm256_round_pd(V, _MM_FROUND_TRUNC)
+KFR_SINTRIN f64avx round(f64avx value) { return KFR_mm256_roundnearest_pd(*value); } // KFR_mm256_roundnearest_pd(V) is _mm256_round_pd(V, _MM_FROUND_NINT)
+KFR_SINTRIN f32avx fract(f32avx x) { return x - floor(x); } // fractional part, measured from floor(x)
+KFR_SINTRIN f64avx fract(f64avx x) { return x - floor(x); }
+#endif
 
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> floor(vec<f32, N> x)
-    {
-        vec<f32, N> t = cast<f32>(cast<i32>(x));
-        return t - (bitcast<f32>(x < t) & 1.f);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> floor(vec<f64, N> x)
-    {
-        vec<f64, N> t = cast<f64>(cast<i64>(x));
-        return t - (bitcast<f64>(x < t) & 1.0);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> ceil(vec<f32, N> x)
-    {
-        vec<f32, N> t = cast<f32>(cast<i32>(x));
-        return t + (bitcast<f32>(x > t) & 1.f);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> ceil(vec<f64, N> x)
-    {
-        vec<f64, N> t = cast<f64>(cast<i64>(x));
-        return t + (bitcast<f64>(x > t) & 1.0);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> round(vec<f32, N> x)
-    {
-        return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x)));
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> round(vec<f64, N> x)
-    {
-        return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x)));
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> trunc(vec<f32, N> x)
-    {
-        return cast<f32>(cast<i32>(x));
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> trunc(vec<f64, N> x)
-    {
-        return cast<f64>(cast<i64>(x));
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> fract(vec<f32, N> x)
-    {
-        return x - floor(x);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> fract(vec<f64, N> x)
-    {
-        return x - floor(x);
-    }
+KFR_HANDLE_ALL_SIZES_1(floor) // NOTE(review): macro is defined elsewhere; presumably adds overloads for vector sizes other than the register-sized ones above - confirm
+KFR_HANDLE_ALL_SIZES_1(ceil)
+KFR_HANDLE_ALL_SIZES_1(round)
+KFR_HANDLE_ALL_SIZES_1(trunc)
+KFR_HANDLE_ALL_SIZES_1(fract)
 
-    KFR_HANDLE_SCALAR(floor)
-    KFR_HANDLE_SCALAR(ceil)
-    KFR_HANDLE_SCALAR(round)
-    KFR_HANDLE_SCALAR(trunc)
-    KFR_HANDLE_SCALAR(fract)
-    KFR_SPEC_FN(in_round, floor)
-    KFR_SPEC_FN(in_round, ceil)
-    KFR_SPEC_FN(in_round, round)
-    KFR_SPEC_FN(in_round, trunc)
-    KFR_SPEC_FN(in_round, fract)
-};
+#else
 
-#ifdef CID_ARCH_X86
+// fallback
 
-template <>
-struct in_round<cpu_t::sse41> : in_round<cpu_t::common>
+template <size_t N>
+KFR_SINTRIN vec<f32, N> floor(vec<f32, N> x)
 {
-    constexpr static cpu_t cpu = cpu_t::sse41;
-
-    KFR_SINTRIN f32sse floor(f32sse value) { return _mm_floor_ps(*value); }
-    KFR_SINTRIN f32sse ceil(f32sse value) { return _mm_ceil_ps(*value); }
-    KFR_SINTRIN f32sse trunc(f32sse value) { return KFR_mm_trunc_ps(*value); }
-    KFR_SINTRIN f32sse round(f32sse value) { return KFR_mm_roundnearest_ps(*value); }
-    KFR_SINTRIN f64sse floor(f64sse value) { return _mm_floor_pd(*value); }
-    KFR_SINTRIN f64sse ceil(f64sse value) { return _mm_ceil_pd(*value); }
-    KFR_SINTRIN f64sse trunc(f64sse value) { return KFR_mm_trunc_pd(*value); }
-    KFR_SINTRIN f64sse round(f64sse value) { return KFR_mm_roundnearest_pd(*value); }
-    KFR_SINTRIN f32sse fract(f32sse x) { return x - floor(x); }
-    KFR_SINTRIN f64sse fract(f64sse x) { return x - floor(x); }
-
-    KFR_HANDLE_ALL(floor)
-    KFR_HANDLE_ALL(ceil)
-    KFR_HANDLE_ALL(round)
-    KFR_HANDLE_ALL(trunc)
-    KFR_HANDLE_ALL(fract)
-    KFR_HANDLE_SCALAR(floor)
-    KFR_HANDLE_SCALAR(ceil)
-    KFR_HANDLE_SCALAR(round)
-    KFR_HANDLE_SCALAR(trunc)
-    KFR_HANDLE_SCALAR(fract)
-    KFR_SPEC_FN(in_round, floor)
-    KFR_SPEC_FN(in_round, ceil)
-    KFR_SPEC_FN(in_round, round)
-    KFR_SPEC_FN(in_round, trunc)
-    KFR_SPEC_FN(in_round, fract)
-};
-
-template <>
-struct in_round<cpu_t::avx1> : in_round<cpu_t::sse41>
+    vec<f32, N> t = cast<f32>(cast<i32>(x)); // fallback floor (f32): round-trip through i32; NOTE(review): presumably truncates toward zero and only covers the i32 range - confirm
+    return t - (bitcast<f32>(x < t) & 1.f); // subtract 1 in lanes where x < t; assumes the comparison yields an all-ones bit pattern per true lane
+}
+template <size_t N>
+KFR_SINTRIN vec<f64, N> floor(vec<f64, N> x) // fallback floor (f64): same scheme, going through i64
 {
-    constexpr static cpu_t cpu = cpu_t::avx1;
-    using in_round<cpu_t::sse41>::floor;
-    using in_round<cpu_t::sse41>::ceil;
-    using in_round<cpu_t::sse41>::trunc;
-    using in_round<cpu_t::sse41>::round;
-    using in_round<cpu_t::sse41>::fract;
-
-    KFR_SINTRIN f32avx floor(f32avx value) { return _mm256_floor_ps(*value); }
-    KFR_SINTRIN f32avx ceil(f32avx value) { return _mm256_ceil_ps(*value); }
-    KFR_SINTRIN f32avx trunc(f32avx value) { return KFR_mm256_trunc_ps(*value); }
-    KFR_SINTRIN f32avx round(f32avx value) { return KFR_mm256_roundnearest_ps(*value); }
-    KFR_SINTRIN f64avx floor(f64avx value) { return _mm256_floor_pd(*value); }
-    KFR_SINTRIN f64avx ceil(f64avx value) { return _mm256_ceil_pd(*value); }
-    KFR_SINTRIN f64avx trunc(f64avx value) { return KFR_mm256_trunc_pd(*value); }
-    KFR_SINTRIN f64avx round(f64avx value) { return KFR_mm256_roundnearest_pd(*value); }
-    KFR_SINTRIN f32avx fract(f32avx x) { return x - floor(x); }
-    KFR_SINTRIN f64avx fract(f64avx x) { return x - floor(x); }
-
-    KFR_HANDLE_ALL(floor)
-    KFR_HANDLE_ALL(ceil)
-    KFR_HANDLE_ALL(round)
-    KFR_HANDLE_ALL(trunc)
-    KFR_HANDLE_ALL(fract)
-    KFR_HANDLE_SCALAR(floor)
-    KFR_HANDLE_SCALAR(ceil)
-    KFR_HANDLE_SCALAR(round)
-    KFR_HANDLE_SCALAR(trunc)
-    KFR_HANDLE_SCALAR(fract)
-    KFR_SPEC_FN(in_round, floor)
-    KFR_SPEC_FN(in_round, ceil)
-    KFR_SPEC_FN(in_round, round)
-    KFR_SPEC_FN(in_round, trunc)
-    KFR_SPEC_FN(in_round, fract)
-};
+    vec<f64, N> t = cast<f64>(cast<i64>(x)); // round-trip through i64; NOTE(review): presumably truncates toward zero - confirm
+    return t - (bitcast<f64>(x < t) & 1.0); // subtract 1 in lanes where x < t
+}
+template <size_t N>
+KFR_SINTRIN vec<f32, N> ceil(vec<f32, N> x) // fallback ceil (f32), used when CID_ARCH_SSE41 is not defined
+{
+    vec<f32, N> t = cast<f32>(cast<i32>(x)); // round-trip through i32
+    return t + (bitcast<f32>(x > t) & 1.f); // add 1 in lanes where x > t
+}
+template <size_t N>
+KFR_SINTRIN vec<f64, N> ceil(vec<f64, N> x) // fallback ceil (f64)
+{
+    vec<f64, N> t = cast<f64>(cast<i64>(x)); // round-trip through i64
+    return t + (bitcast<f64>(x > t) & 1.0); // add 1 in lanes where x > t
+}
+template <size_t N>
+KFR_SINTRIN vec<f32, N> round(vec<f32, N> x) // fallback round (f32)
+{
+    return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x))); // NOTE(review): mulsign presumably gives 0.5 the sign of x, i.e. round half away from zero - confirm
+}
+template <size_t N>
+KFR_SINTRIN vec<f64, N> round(vec<f64, N> x) // fallback round (f64)
+{
+    return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x))); // same scheme through i64
+}
+template <size_t N>
+KFR_SINTRIN vec<f32, N> trunc(vec<f32, N> x) // fallback trunc (f32)
+{
+    return cast<f32>(cast<i32>(x)); // integer round-trip only
+}
+template <size_t N>
+KFR_SINTRIN vec<f64, N> trunc(vec<f64, N> x) // fallback trunc (f64)
+{
+    return cast<f64>(cast<i64>(x)); // integer round-trip only
+}
+template <size_t N>
+KFR_SINTRIN vec<f32, N> fract(vec<f32, N> x) // fallback fract (f32)
+{
+    return x - floor(x); // fractional part, measured from floor(x)
+}
+template <size_t N>
+KFR_SINTRIN vec<f64, N> fract(vec<f64, N> x) // fallback fract (f64)
+{
+    return x - floor(x); // fractional part, measured from floor(x)
+}
+#endif
 
-#undef KFR_mm_trunc_ps
-#undef KFR_mm_roundnearest_ps
-#undef KFR_mm_trunc_pd
-#undef KFR_mm_roundnearest_pd
-#undef KFR_mm_trunc_ss
-#undef KFR_mm_roundnearest_ss
-#undef KFR_mm_trunc_sd
-#undef KFR_mm_roundnearest_sd
-#undef KFR_mm_floor_ss
-#undef KFR_mm_floor_sd
-#undef KFR_mm_ceil_ss
-#undef KFR_mm_ceil_sd
-#undef KFR_mm256_trunc_ps
-#undef KFR_mm256_roundnearest_ps
-#undef KFR_mm256_trunc_pd
-#undef KFR_mm256_roundnearest_pd
+// template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+// KFR_SINTRIN vec<T, N> floor(vec<T, N> value)
+//{
+//    return value;
+//}
+// template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+// KFR_SINTRIN vec<T, N> ceil(vec<T, N> value)
+//{
+//    return value;
+//}
+// template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+// KFR_SINTRIN vec<T, N> trunc(vec<T, N> value)
+//{
+//    return value;
+//}
+// template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+// KFR_SINTRIN vec<T, N> round(vec<T, N> value)
+//{
+//    return value;
+//}
+// template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+// KFR_SINTRIN vec<T, N> fract(vec<T, N>)
+//{
+//    return T(0);
+//}
 
-#endif
+KFR_HANDLE_SCALAR_1(floor) // NOTE(review): macro defined elsewhere; presumably adds a scalar overload that routes through the vector version - confirm
+KFR_HANDLE_SCALAR_1(ceil)
+KFR_HANDLE_SCALAR_1(round)
+KFR_HANDLE_SCALAR_1(trunc)
+KFR_HANDLE_SCALAR_1(fract)
+KFR_FN(floor) // NOTE(review): presumably declares the functor fn_floor that the public wrappers name as internal::fn_floor - confirm
+KFR_FN(ceil)
+KFR_FN(round)
+KFR_FN(trunc)
+KFR_FN(fract)
 }
 
-namespace native
-{
-using fn_floor = internal::in_round<>::fn_floor;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> floor(const T1& x)
 {
-    return internal::in_round<>::floor(x);
+    return internal::floor(x); // numeric argument: forwards to the internal::floor overload set
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_floor, E1> floor(E1&& x)
+KFR_INTRIN expr_func<internal::fn_floor, E1> floor(E1&& x)
 {
-    return { fn_floor(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) }; // expression argument: builds expr_func; `{}` replaces the explicit fn_floor() of the previous version
 }
 
-using fn_ceil = internal::in_round<>::fn_ceil;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> ceil(const T1& x)
 {
-    return internal::in_round<>::ceil(x);
+    return internal::ceil(x); // numeric argument: forwards to the internal::ceil overload set
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_ceil, E1> ceil(E1&& x)
+KFR_INTRIN expr_func<internal::fn_ceil, E1> ceil(E1&& x)
 {
-    return { fn_ceil(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) }; // expression argument: builds expr_func; `{}` replaces the explicit fn_ceil() of the previous version
 }
 
-using fn_round = internal::in_round<>::fn_round;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> round(const T1& x)
 {
-    return internal::in_round<>::round(x);
+    return internal::round(x); // numeric argument: forwards to the internal::round overload set
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_round, E1> round(E1&& x)
+KFR_INTRIN expr_func<internal::fn_round, E1> round(E1&& x)
 {
-    return { fn_round(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) }; // expression argument: builds expr_func; `{}` replaces the explicit fn_round() of the previous version
 }
 
-using fn_trunc = internal::in_round<>::fn_trunc;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> trunc(const T1& x)
 {
-    return internal::in_round<>::trunc(x);
+    return internal::trunc(x); // numeric argument: forwards to the internal::trunc overload set
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_trunc, E1> trunc(E1&& x)
+KFR_INTRIN expr_func<internal::fn_trunc, E1> trunc(E1&& x)
 {
-    return { fn_trunc(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) }; // expression argument: builds expr_func; `{}` replaces the explicit fn_trunc() of the previous version
 }
 
-using fn_fract = internal::in_round<>::fn_fract;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> fract(const T1& x)
 {
-    return internal::in_round<>::fract(x);
+    return internal::fract(x); // numeric argument: forwards to the internal::fract overload set
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_fract, E1> fract(E1&& x)
+KFR_INTRIN expr_func<internal::fn_fract, E1> fract(E1&& x)
 {
-    return { fn_fract(), std::forward<E1>(x) };
-}
+    return { {}, std::forward<E1>(x) }; // expression argument: builds expr_func; `{}` replaces the explicit fn_fract() of the previous version
 }
 }
+
+#undef KFR_mm_trunc_ps
+#undef KFR_mm_roundnearest_ps
+#undef KFR_mm_trunc_pd
+#undef KFR_mm_roundnearest_pd
+#undef KFR_mm_trunc_ss
+#undef KFR_mm_roundnearest_ss
+#undef KFR_mm_trunc_sd
+#undef KFR_mm_roundnearest_sd
+#undef KFR_mm_floor_ss
+#undef KFR_mm_floor_sd
+#undef KFR_mm_ceil_ss
+#undef KFR_mm_ceil_sd
+#undef KFR_mm256_trunc_ps
+#undef KFR_mm256_roundnearest_ps
+#undef KFR_mm256_trunc_pd
+#undef KFR_mm256_roundnearest_pd
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -25,181 +25,125 @@
 #include "function.hpp"
 #include "select.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
-
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_saturated : in_saturated<older(c), cc>
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
 {
-    struct fn_satadd : in_saturated<older(c), cc>::fn_satadd, fn_disabled
-    {
-    };
-    struct fn_satsub : in_saturated<older(c), cc>::fn_satsub, fn_disabled
-    {
-    };
-};
-
-template <cpu_t cc>
-struct in_saturated<cpu_t::common, cc> : in_select<cc>
+    constexpr size_t shift = typebits<T>::bits - 1; // sign-bit index of the element type T (was hard-coded to i32, wrong for 64-bit and narrower lanes)
+    const vec<T, N> sum = a + b; // plain element-wise sum, returned for lanes where no overflow is detected
+    a = (a >> shift) + allonesvector(a); // meant to replace a by the saturation bound matching its sign; NOTE(review): confirm >> and allonesvector really yield max/min of T here
+
+    return select(((a ^ b) | ~(b ^ sum)) >= 0, a, sum); // sign-bit test: lanes where the combined value is non-negative take `a`, all others take `sum`
+}
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
 {
-    constexpr static cpu_t cpu = cpu_t::common;
-
-    template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-    KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
-    {
-        return saturated_signed_add(a, b);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-    KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
-    {
-        return saturated_unsigned_add(a, b);
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-    KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
-    {
-        return saturated_signed_sub(a, b);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-    KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
-    {
-        return saturated_unsigned_sub(a, b);
-    }
-    KFR_SPEC_FN(in_saturated, satadd)
-    KFR_SPEC_FN(in_saturated, satsub)
-
-protected:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
-    {
-        constexpr size_t shift = typebits<i32>::bits - 1;
-        const vec<T, N> sum = a + b;
-        a = (a >> shift) + allonesvector(a);
-
-        return select(((a ^ b) | ~(b ^ sum)) >= 0, a, sum);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
-    {
-        constexpr size_t shift = typebits<i32>::bits - 1;
-        const vec<T, N> diff = a - b;
-        a = (a >> shift) + allonesvector(a);
-
-        return select(((a ^ b) & (a ^ diff)) < 0, a, diff);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> saturated_unsigned_add(vec<T, N> a, vec<T, N> b)
-    {
-        constexpr vec<T, N> t = allonesvector(a);
-        return select(a > t - b, t, a + b);
-    }
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> saturated_unsigned_sub(vec<T, N> a, vec<T, N> b)
-    {
-        return select(a < b, zerovector(a), a - b);
-    }
-};
-
-#ifdef CID_ARCH_X86
-
-template <cpu_t cc>
-struct in_saturated<cpu_t::sse2, cc> : in_saturated<cpu_t::common>, in_select<cc>
+    constexpr size_t shift = typebits<T>::bits - 1; // sign-bit index of the element type T (was hard-coded to i32, wrong for 64-bit and narrower lanes)
+    const vec<T, N> diff = a - b; // plain element-wise difference, returned for lanes where no overflow is detected
+    a = (a >> shift) + allonesvector(a); // meant to replace a by the saturation bound matching its sign; NOTE(review): confirm >> and allonesvector really yield max/min of T here
+
+    return select(((a ^ b) & (a ^ diff)) < 0, a, diff); // sign-bit test: lanes where the combined value is negative take `a`, all others take `diff`
+}
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> saturated_unsigned_add(vec<T, N> a, vec<T, N> b)
 {
-    constexpr static cpu_t cpu = cpu_t::sse2;
-
-private:
-    using in_select<cc>::select;
-
-public:
-    KFR_SINTRIN u8sse satadd(u8sse x, u8sse y) { return _mm_adds_epu8(*x, *y); }
-    KFR_SINTRIN i8sse satadd(i8sse x, i8sse y) { return _mm_adds_epi8(*x, *y); }
-    KFR_SINTRIN u16sse satadd(u16sse x, u16sse y) { return _mm_adds_epu16(*x, *y); }
-    KFR_SINTRIN i16sse satadd(i16sse x, i16sse y) { return _mm_adds_epi16(*x, *y); }
-
-    KFR_SINTRIN u8sse satsub(u8sse x, u8sse y) { return _mm_subs_epu8(*x, *y); }
-    KFR_SINTRIN i8sse satsub(i8sse x, i8sse y) { return _mm_subs_epi8(*x, *y); }
-    KFR_SINTRIN u16sse satsub(u16sse x, u16sse y) { return _mm_subs_epu16(*x, *y); }
-    KFR_SINTRIN i16sse satsub(i16sse x, i16sse y) { return _mm_subs_epi16(*x, *y); }
-
-    KFR_SINTRIN i32sse satadd(i32sse a, i32sse b) { return saturated_signed_add(a, b); }
-    KFR_SINTRIN i64sse satadd(i64sse a, i64sse b) { return saturated_signed_add(a, b); }
-    KFR_SINTRIN u32sse satadd(u32sse a, u32sse b) { return saturated_unsigned_add(a, b); }
-    KFR_SINTRIN u64sse satadd(u64sse a, u64sse b) { return saturated_unsigned_add(a, b); }
-
-    KFR_SINTRIN i32sse satsub(i32sse a, i32sse b) { return saturated_signed_sub(a, b); }
-    KFR_SINTRIN i64sse satsub(i64sse a, i64sse b) { return saturated_signed_sub(a, b); }
-    KFR_SINTRIN u32sse satsub(u32sse a, u32sse b) { return saturated_unsigned_sub(a, b); }
-    KFR_SINTRIN u64sse satsub(u64sse a, u64sse b) { return saturated_unsigned_sub(a, b); }
-
-    KFR_HANDLE_ALL(satadd)
-    KFR_HANDLE_ALL(satsub)
-    KFR_HANDLE_SCALAR(satadd)
-    KFR_HANDLE_SCALAR(satsub)
-    KFR_SPEC_FN(in_saturated, satadd)
-    KFR_SPEC_FN(in_saturated, satsub)
-};
-
-template <cpu_t cc>
-struct in_saturated<cpu_t::avx2, cc> : in_saturated<cpu_t::sse2, cc>
+    const vec<T, N> t = allonesvector(a); // presumably every bit set, i.e. the largest value of an unsigned T - confirm
+    return select(a > t - b, t, a + b); // a exceeds the headroom t - b: the sum would wrap, so return t; otherwise a + b
+}
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> saturated_unsigned_sub(vec<T, N> a, vec<T, N> b)
 {
-    constexpr static cpu_t cpu = cpu_t::avx2;
-    using in_saturated<cpu_t::sse2, cc>::satadd;
-    using in_saturated<cpu_t::sse2, cc>::satsub;
-
-    KFR_SINTRIN u8avx satadd(u8avx x, u8avx y) { return _mm256_adds_epu8(*x, *y); }
-    KFR_SINTRIN i8avx satadd(i8avx x, i8avx y) { return _mm256_adds_epi8(*x, *y); }
-    KFR_SINTRIN u16avx satadd(u16avx x, u16avx y) { return _mm256_adds_epu16(*x, *y); }
-    KFR_SINTRIN i16avx satadd(i16avx x, i16avx y) { return _mm256_adds_epi16(*x, *y); }
-
-    KFR_SINTRIN u8avx satsub(u8avx x, u8avx y) { return _mm256_subs_epu8(*x, *y); }
-    KFR_SINTRIN i8avx satsub(i8avx x, i8avx y) { return _mm256_subs_epi8(*x, *y); }
-    KFR_SINTRIN u16avx satsub(u16avx x, u16avx y) { return _mm256_subs_epu16(*x, *y); }
-    KFR_SINTRIN i16avx satsub(i16avx x, i16avx y) { return _mm256_subs_epi16(*x, *y); }
-
-    KFR_HANDLE_ALL(satadd)
-    KFR_HANDLE_ALL(satsub)
-    KFR_HANDLE_SCALAR(satadd)
-    KFR_HANDLE_SCALAR(satsub)
-    KFR_SPEC_FN(in_saturated, satadd)
-    KFR_SPEC_FN(in_saturated, satsub)
-};
+    return select(a < b, zerovector(a), a - b); // a < b: the difference would wrap, so return zerovector(a); otherwise a - b
+}
+
+#if defined CID_ARCH_SSE2
+
+KFR_SINTRIN u8sse satadd(u8sse x, u8sse y) { return _mm_adds_epu8(*x, *y); } // SSE2 (inside #if defined CID_ARCH_SSE2): 8/16-bit lanes map to the saturating-add intrinsics
+KFR_SINTRIN i8sse satadd(i8sse x, i8sse y) { return _mm_adds_epi8(*x, *y); }
+KFR_SINTRIN u16sse satadd(u16sse x, u16sse y) { return _mm_adds_epu16(*x, *y); }
+KFR_SINTRIN i16sse satadd(i16sse x, i16sse y) { return _mm_adds_epi16(*x, *y); }
+
+KFR_SINTRIN u8sse satsub(u8sse x, u8sse y) { return _mm_subs_epu8(*x, *y); } // 8/16-bit lanes map to the saturating-subtract intrinsics
+KFR_SINTRIN i8sse satsub(i8sse x, i8sse y) { return _mm_subs_epi8(*x, *y); }
+KFR_SINTRIN u16sse satsub(u16sse x, u16sse y) { return _mm_subs_epu16(*x, *y); }
+KFR_SINTRIN i16sse satsub(i16sse x, i16sse y) { return _mm_subs_epi16(*x, *y); }
+
+KFR_SINTRIN i32sse satadd(i32sse a, i32sse b) { return saturated_signed_add(a, b); } // 32/64-bit lanes use the generic helpers defined above instead of an intrinsic
+KFR_SINTRIN i64sse satadd(i64sse a, i64sse b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN u32sse satadd(u32sse a, u32sse b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN u64sse satadd(u64sse a, u64sse b) { return saturated_unsigned_add(a, b); }
+
+KFR_SINTRIN i32sse satsub(i32sse a, i32sse b) { return saturated_signed_sub(a, b); } // 32/64-bit lanes: generic helpers for subtraction
+KFR_SINTRIN i64sse satsub(i64sse a, i64sse b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN u32sse satsub(u32sse a, u32sse b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN u64sse satsub(u64sse a, u64sse b) { return saturated_unsigned_sub(a, b); }
+
+#if defined CID_ARCH_AVX2
+KFR_SINTRIN u8avx satadd(u8avx x, u8avx y) { return _mm256_adds_epu8(*x, *y); } // AVX2 (inside #if defined CID_ARCH_AVX2): 256-bit saturating add for 8/16-bit lanes
+KFR_SINTRIN i8avx satadd(i8avx x, i8avx y) { return _mm256_adds_epi8(*x, *y); }
+KFR_SINTRIN u16avx satadd(u16avx x, u16avx y) { return _mm256_adds_epu16(*x, *y); }
+KFR_SINTRIN i16avx satadd(i16avx x, i16avx y) { return _mm256_adds_epi16(*x, *y); }
+
+KFR_SINTRIN u8avx satsub(u8avx x, u8avx y) { return _mm256_subs_epu8(*x, *y); } // 256-bit saturating subtract for 8/16-bit lanes
+KFR_SINTRIN i8avx satsub(i8avx x, i8avx y) { return _mm256_subs_epi8(*x, *y); }
+KFR_SINTRIN u16avx satsub(u16avx x, u16avx y) { return _mm256_subs_epu16(*x, *y); }
+KFR_SINTRIN i16avx satsub(i16avx x, i16avx y) { return _mm256_subs_epi16(*x, *y); }
 #endif
+
+#else
+// fallback
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> // fallback when CID_ARCH_SSE2 is not defined: overloads selected by signedness of T
+KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
+{
+    return saturated_signed_add(a, b); // signed element types
 }
-namespace native
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
+KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
 {
-using fn_satadd = internal::in_saturated<>::fn_satadd;
+    return saturated_unsigned_add(a, b); // unsigned element types
+}
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
+KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+{
+    return saturated_signed_sub(a, b); // signed element types
+}
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
+KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+{
+    return saturated_unsigned_sub(a, b); // unsigned element types
+}
+#endif
+KFR_HANDLE_SCALAR_1(satadd) // NOTE(review): macro defined elsewhere; presumably adds a scalar overload of satadd - confirm
+KFR_FN(satadd) // NOTE(review): presumably declares fn_satadd, which the public wrapper names as internal::fn_satadd - confirm
+KFR_HANDLE_SCALAR_1(satsub)
+KFR_FN(satsub)
+}
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> satadd(const T1& x, const T2& y)
+KFR_INTRIN common_type<T1, T2> satadd(const T1& x, const T2& y) // return type is now common_type<T1, T2> itself, no longer its ftype<>
 {
-    return internal::in_saturated<>::satadd(x, y);
+    return internal::satadd(x, y); // numeric arguments: forwards to the internal::satadd overload set
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_satadd, E1, E2> satadd(E1&& x, E2&& y)
+KFR_INTRIN expr_func<internal::fn_satadd, E1, E2> satadd(E1&& x, E2&& y)
 {
-    return { fn_satadd(), std::forward<E1>(x), std::forward<E2>(y) };
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) }; // expression arguments: builds expr_func; `{}` replaces the explicit fn_satadd()
 }
-using fn_satsub = internal::in_saturated<>::fn_satsub;
+
 template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_INLINE ftype<common_type<T1, T2>> satsub(const T1& x, const T2& y)
+KFR_INTRIN common_type<T1, T2> satsub(const T1& x, const T2& y) // return type is now common_type<T1, T2> itself, no longer its ftype<>
 {
-    return internal::in_saturated<>::satsub(x, y);
+    return internal::satsub(x, y); // numeric arguments: forwards to the internal::satsub overload set
 }
 
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INLINE expr_func<fn_satsub, E1, E2> satsub(E1&& x, E2&& y)
+KFR_INTRIN expr_func<internal::fn_satsub, E1, E2> satsub(E1&& x, E2&& y)
 {
-    return { fn_satsub(), std::forward<E1>(x), std::forward<E2>(y) };
+    return { {}, std::forward<E1>(x), std::forward<E2>(y) }; // expression arguments: builds expr_func; `{}` replaces the explicit fn_satsub()
 }
 }
-}
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -29,174 +29,71 @@ namespace kfr
 namespace internal
 {
 
-template <cpu_t c>
-struct in_select_impl : in_select_impl<older(c)>
-{
-    struct fn_select : fn_disabled
-    {
-    };
-};
-
-template <>
-struct in_select_impl<cpu_t::common>
-{
-    constexpr static cpu_t cur = cpu_t::common;
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> select(vec<T, N> m, vec<T, N> x, vec<T, N> y)
-    {
-        return y ^ ((x ^ y) & m);
-    }
-    KFR_SPEC_FN(in_select_impl, select)
-};
+#if defined CID_ARCH_SSE41
+
+KFR_SINTRIN u8sse select(mu8sse m, u8sse x, u8sse y) { return _mm_blendv_epi8(*y, *x, *m); } // SSE4.1: blend with y as first source and x as second, so x is taken where m is set
+KFR_SINTRIN u16sse select(mu16sse m, u16sse x, u16sse y) { return _mm_blendv_epi8(*y, *x, *m); } // byte-wise blend reused for wider lanes; assumes each mask lane is all-ones or all-zeros
+KFR_SINTRIN u32sse select(mu32sse m, u32sse x, u32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN u64sse select(mu64sse m, u64sse x, u64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i8sse select(mi8sse m, i8sse x, i8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i16sse select(mi16sse m, i16sse x, i16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i32sse select(mi32sse m, i32sse x, i32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i64sse select(mi64sse m, i64sse x, i64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN f32sse select(mf32sse m, f32sse x, f32sse y) { return _mm_blendv_ps(*y, *x, *m); } // float lanes use the single-precision blend
+KFR_SINTRIN f64sse select(mf64sse m, f64sse x, f64sse y) { return _mm_blendv_pd(*y, *x, *m); } // double lanes use the double-precision blend
+
+#if defined CID_ARCH_AVX
+KFR_SINTRIN f64avx select(mf64avx m, f64avx x, f64avx y) { return _mm256_blendv_pd(*y, *x, *m); } // AVX: 256-bit floating-point blends, same argument order as the SSE4.1 overloads
+KFR_SINTRIN f32avx select(mf32avx m, f32avx x, f32avx y) { return _mm256_blendv_ps(*y, *x, *m); }
+#endif
 
-#ifdef CID_ARCH_X86
+#if defined CID_ARCH_AVX2
+KFR_SINTRIN u8avx select(mu8avx m, u8avx x, u8avx y) { return _mm256_blendv_epi8(*y, *x, *m); } // AVX2: 256-bit integer blends; byte-wise blend reused for every integer lane width
+KFR_SINTRIN u16avx select(mu16avx m, u16avx x, u16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN u32avx select(mu32avx m, u32avx x, u32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN u64avx select(mu64avx m, u64avx x, u64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i8avx select(mi8avx m, i8avx x, i8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i16avx select(mi16avx m, i16avx x, i16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i32avx select(mi32avx m, i32avx x, i32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN i64avx select(mi64avx m, i64avx x, i64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+#endif
 
-template <>
-struct in_select_impl<cpu_t::sse41> : in_select_impl<cpu_t::common>
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
 {
-    constexpr static cpu_t cpu = cpu_t::sse41;
-
-    KFR_CPU_INTRIN(sse41) u8sse select(u8sse m, u8sse x, u8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) u16sse select(u16sse m, u16sse x, u16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) u32sse select(u32sse m, u32sse x, u32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) u64sse select(u64sse m, u64sse x, u64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) i8sse select(i8sse m, i8sse x, i8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) i16sse select(i16sse m, i16sse x, i16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) i32sse select(i32sse m, i32sse x, i32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) i64sse select(i64sse m, i64sse x, i64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) f32sse select(f32sse m, f32sse x, f32sse y) { return _mm_blendv_ps(*y, *x, *m); }
-    KFR_CPU_INTRIN(sse41) f64sse select(f64sse m, f64sse x, f64sse y) { return _mm_blendv_pd(*y, *x, *m); }
-
-    KFR_HANDLE_ALL(select)
-    KFR_SPEC_FN(in_select_impl, select)
-};
-
-template <>
-struct in_select_impl<cpu_t::avx1> : in_select_impl<cpu_t::sse41>
+    return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c))); // N below the native width: expand all operands, select, keep the first N lanes
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> // extra `typename = void` keeps this signature distinct from the N < width overload
+KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
 {
-    constexpr static cpu_t cpu = cpu_t::avx1;
-    using in_select_impl<cpu_t::sse41>::select;
-
-    KFR_CPU_INTRIN(avx) f64avx select(f64avx m, f64avx x, f64avx y) { return _mm256_blendv_pd(*y, *x, *m); }
-    KFR_CPU_INTRIN(avx) f32avx select(f32avx m, f32avx x, f32avx y) { return _mm256_blendv_ps(*y, *x, *m); }
+    return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); // N at or above the native width: select on the low and high parts separately, then concatenate
+}
 
-    KFR_HANDLE_ALL(select)
-    KFR_SPEC_FN(in_select_impl, select)
-};
+#else
 
-template <>
-struct in_select_impl<cpu_t::avx2> : in_select_impl<cpu_t::avx1>
+// fallback
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> select(mask<T, N> m, vec<T, N> x, vec<T, N> y)
 {
-    constexpr static cpu_t cpu = cpu_t::avx2;
-    using in_select_impl<cpu_t::avx1>::select;
-
-    KFR_CPU_INTRIN(avx2) u8avx select(u8avx m, u8avx x, u8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(avx2) u16avx select(u16avx m, u16avx x, u16avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-    KFR_CPU_INTRIN(avx2) u32avx select(u32avx m, u32avx x, u32avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-    KFR_CPU_INTRIN(avx2) u64avx select(u64avx m, u64avx x, u64avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-    KFR_CPU_INTRIN(avx2) i8avx select(i8avx m, i8avx x, i8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-    KFR_CPU_INTRIN(avx2) i16avx select(i16avx m, i16avx x, i16avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-    KFR_CPU_INTRIN(avx2) i32avx select(i32avx m, i32avx x, i32avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-    KFR_CPU_INTRIN(avx2) i64avx select(i64avx m, i64avx x, i64avx y)
-    {
-        return _mm256_blendv_epi8(*y, *x, *m);
-    }
-
-    KFR_HANDLE_ALL(select)
-    KFR_SPEC_FN(in_select_impl, select)
-};
-
+    return y ^ ((x ^ y) & m); // bitwise blend: result bits come from x where m is 1 and from y where m is 0
+}
 #endif
 
-template <cpu_t c = cpu_t::native>
-struct in_select : in_select_impl<c>
-{
-    using in_select_impl<c>::select;
-
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, vec<T, N> x, vec<T, N> y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), x, y);
-    }
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, mask<T, N> x, mask<T, N> y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), ref_cast<vec<T, N>>(x), ref_cast<vec<T, N>>(y));
-    }
-
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, T x, T y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), broadcast<N>(x), broadcast<N>(y));
-    }
-
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, vec<T, N> x, T y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), x, broadcast<N>(y));
-    }
-
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, T x, vec<T, N> y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), broadcast<N>(x), y);
-    }
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, mask<T, N> x, T y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(bitcast<T>(m), ref_cast<vec<T, N>>(x), broadcast<N>(y));
-    }
-
-    template <typename T, size_t N, typename M>
-    KFR_SINTRIN vec<T, N> select(mask<M, N> m, T x, mask<T, N> y)
-    {
-        static_assert(sizeof(M) == sizeof(T), "select: Incompatible types");
-        return in_select_impl<c>::select(m, broadcast<N>(x), ref_cast<vec<T, N>>(y));
-    }
-    KFR_SPEC_FN(in_select, select)
-
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> sign(vec<T, N> x)
-    {
-        return select(x > T(), T(1), select(x < T(), T(-1), T(0)));
-    }
-};
+KFR_I_FN(select) // NOTE(review): macro defined elsewhere; presumably declares fn_select, which the public wrapper names as internal::fn_select - confirm
 }
 
-namespace native
+template <typename T1, size_t N, typename T2, typename T3,
+          KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value),
+          typename Tout = subtype<common_type<T2, T3>>> // Tout: element type of the result, derived from the common type of x and y
+KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
 {
-using fn_select = internal::in_select<>::fn_select;
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INLINE ftype<common_type<T2, T3>> select(const T1& arg1, const T2& arg2, const T3& arg3)
-{
-    return internal::in_select<>::select(arg1, arg2, arg3);
+    static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); // mask elements must be the same size as result elements
+    return internal::select(bitcast<Tout>(m).asmask(), static_cast<vec<Tout, N>>(x), static_cast<vec<Tout, N>>(y)); // mask is bitcast to Tout; x and y are converted to vec<Tout, N> before dispatch
 }
+
 template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INLINE expr_func<fn_select, E1, E2, E3> select(E1&& arg1, E2&& arg2, E3&& arg3)
+KFR_INTRIN expr_func<internal::fn_select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y)
 {
-    return { fn_select(), std::forward<E1>(arg1), std::forward<E2>(arg2), std::forward<E3>(arg3) };
-}
+    return { {}, std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) }; // expression arguments: builds expr_func; `{}` replaces the explicit fn_select()
 }
 }
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -31,10 +31,6 @@
 #include "select.hpp"
 #include "shuffle.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
 #if CID_HAS_WARNING("-Wc99-extensions")
 #pragma clang diagnostic ignored "-Wc99-extensions"
 #endif
@@ -45,517 +41,402 @@ namespace kfr
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_trig : in_select<cc>
-{
-private:
-    using in_select<cc>::select;
-
-protected:
-    template <typename T, size_t N>
-    KFR_SINTRIN vec<T, N> mask_horner(vec<T, N>, mask<T, N> msk, T a0, T b0)
-    {
-        return select(msk, a0, b0);
-    }
-
-    template <typename T, size_t N, typename... Ts>
-    KFR_SINTRIN vec<T, N> mask_horner(vec<T, N> x, mask<T, N> msk, T a0, T b0, T a1, T b1, Ts... values)
-    {
-        return fmadd(mask_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
-    }
-};
-
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_sin_cos : private in_trig<cc>, private in_select<cc>, private in_round<cc>, private in_abs<cc>
-{
-
-private:
-    using in_abs<cc>::abs;
-    using in_round<cc>::floor;
-    using in_select<cc>::select;
-    using in_trig<cc>::mask_horner;
-
-    template <typename T, size_t N, typename Tprecise = f64>
-    KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
-    {
-        const vec<T, N> xabs    = abs(x);
-        constexpr vec<T, N> div = fold_constant_div<T>;
-        vec<T, N> y             = floor(xabs / div);
-        quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
-
-        const mask<T, N> msk = bitcast<T>((quadrant & 1) != 0);
-        quadrant = select(msk, quadrant + 1, quadrant);
-        y        = select(msk, y + T(1.0), y);
-        quadrant = quadrant & 7;
-
-        constexpr vec<Tprecise, N> hi = cast<Tprecise>(fold_constant_hi<T>);
-        constexpr vec<T, N> rem1      = fold_constant_rem1<T>;
-        constexpr vec<T, N> rem2      = fold_constant_rem2<T>;
-        return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
-    {
-        constexpr f32 sin_c2  = -0x2.aaaaacp-4f;
-        constexpr f32 sin_c4  = 0x2.222334p-8f;
-        constexpr f32 sin_c6  = -0xd.0566ep-16f;
-        constexpr f32 sin_c8  = 0x3.64cc1cp-20f;
-        constexpr f32 sin_c10 = -0x5.6c4a4p-24f;
-        constexpr f32 cos_c2  = -0x8.p-4f;
-        constexpr f32 cos_c4  = 0xa.aaaabp-8f;
-        constexpr f32 cos_c6  = -0x5.b05d48p-12f;
-        constexpr f32 cos_c8  = 0x1.a065f8p-16f;
-        constexpr f32 cos_c10 = -0x4.cd156p-24f;
-
-        const vec<f32, N> x2 = folded * folded;
-
-        vec<f32, N> formula = mask_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6,
-                                          sin_c6, cos_c8, sin_c8, cos_c10, sin_c10);
-
-        formula = select(cosmask, formula, formula * folded);
-        return formula;
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
-    {
-        constexpr f64 sin_c2  = -0x2.aaaaaaaaaaaaap-4;
-        constexpr f64 sin_c4  = 0x2.22222222220cep-8;
-        constexpr f64 sin_c6  = -0xd.00d00cffd6618p-16;
-        constexpr f64 sin_c8  = 0x2.e3bc744fb879ep-20;
-        constexpr f64 sin_c10 = -0x6.b99034c1467a4p-28;
-        constexpr f64 sin_c12 = 0xb.0711ea8fe8ee8p-36;
-        constexpr f64 sin_c14 = -0xb.7e010897e55dp-44;
-        constexpr f64 sin_c16 = -0xb.64eac07f1d6bp-48;
-        constexpr f64 cos_c2  = -0x8.p-4;
-        constexpr f64 cos_c4  = 0xa.aaaaaaaaaaaa8p-8;
-        constexpr f64 cos_c6  = -0x5.b05b05b05ad28p-12;
-        constexpr f64 cos_c8  = 0x1.a01a01a0022e6p-16;
-        constexpr f64 cos_c10 = -0x4.9f93ed845de2cp-24;
-        constexpr f64 cos_c12 = 0x8.f76bc015abe48p-32;
-        constexpr f64 cos_c14 = -0xc.9bf2dbe00379p-40;
-        constexpr f64 cos_c16 = 0xd.1232ac32f7258p-48;
-
-        vec<f64, N> x2 = folded * folded;
-        vec<f64, N> formula =
-            mask_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8,
-                        cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16);
-
-        formula = select(cosmask, formula, formula * folded);
-        return formula;
-    }
-
-    template <typename T, size_t N, typename = u8[N > 1]>
-    KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
-    {
-        vec<itype<T>, N> quadrant;
-        vec<T, N> folded = trig_fold(x_full, quadrant);
-
-        mask<T, N> flip_sign = select(cosmask, (quadrant == 2) || (quadrant == 4), quadrant >= 4);
-
-        mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
-        usecos = usecos ^ cosmask;
-
-        vec<T, N> formula = trig_sincos(folded, usecos);
-
-        mask<T, N> negmask = x_full < 0;
-
-        flip_sign = flip_sign ^ (negmask & ~cosmask);
-
-        formula = select(flip_sign, -formula, formula);
-        return formula;
-    }
-
-    template <typename T>
-    constexpr static T fold_constant_div = choose_const<T>(0x1.921fb6p-1f, 0x1.921fb54442d18p-1);
-
-    template <typename T>
-    constexpr static T fold_constant_hi = choose_const<T>(0x1.922000p-1f, 0x1.921fb40000000p-1);
-    template <typename T>
-    constexpr static T fold_constant_rem1 = choose_const<T>(-0x1.2ae000p-19f, 0x1.4442d00000000p-25);
-    template <typename T>
-    constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49);
-    constexpr static cpu_t cur            = c;
-
-public:
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
-    {
-        vec<itype<T>, N> quadrant;
-        vec<T, N> folded = trig_fold(x, quadrant);
-
-        mask<T, N> flip_sign = quadrant >= 4;
-        mask<T, N> usecos    = (quadrant == 2) || (quadrant == 6);
-
-        vec<T, N> formula = trig_sincos(folded, usecos);
-
-        formula = select(flip_sign ^ x.asmask(), -formula, formula);
-        return formula;
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
-    {
-        vec<itype<T>, N> quadrant;
-        vec<T, N> folded = trig_fold(x, quadrant);
-
-        mask<T, N> eq4       = (quadrant == 4);
-        mask<T, N> flip_sign = (quadrant == 2) || eq4;
-        mask<T, N> usecos    = (quadrant == 0) || eq4;
-
-        vec<T, N> formula = trig_sincos(folded, usecos);
-
-        formula = select(flip_sign, -formula, formula);
-        return formula;
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
-    {
-        constexpr vec<T, N> msk = broadcast<N>(highbitmask<T>);
-
-        constexpr static T c2 = -0.16665853559970855712890625;
-        constexpr static T c4 = +8.31427983939647674560546875e-3;
-        constexpr static T c6 = -1.85423981747590005397796630859375e-4;
-
-        const vec<T, N> pi = c_pi<T>;
-
-        x -= pi;
-        vec<T, N> y = abs(x);
-        y = select(y > c_pi<T, 1, 2>, pi - y, y);
-        y = y ^ (msk & ~x);
-
-        vec<T, N> y2      = y * y;
-        vec<T, N> formula = c6;
-        vec<T, N> y3      = y2 * y;
-        formula = fmadd(formula, y2, c4);
-        formula = fmadd(formula, y2, c2);
-        formula = formula * y3 + y;
-        return formula;
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
-    {
-        x += c_pi<T, 1, 2>;
-        x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
-        return fastsin(x);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> sincos(vec<T, N> x)
-    {
-        return sincos_mask(x, internal::oddmask<T, N>());
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> cossin(vec<T, N> x)
-    {
-        return sincos_mask(x, internal::evenmask<T, N>());
-    }
-
-    template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-    KFR_SINTRIN vec<T, N> sinc(vec<T, N> x)
-    {
-        return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x);
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> sin(vec<T, N> x)
-    {
-        return sin(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> cos(vec<T, N> x)
-    {
-        return cos(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> fastsin(vec<T, N> x)
-    {
-        return fastsin(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> fastcos(vec<T, N> x)
-    {
-        return fastcos(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> sincos(vec<T, N> x)
-    {
-        return sincos(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> cossin(vec<T, N> x)
-    {
-        return cossin(cast<Tout>(x));
-    }
-    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-    KFR_SINTRIN vec<Tout, N> sinc(vec<T, N> x)
-    {
-        return sinc(cast<Tout>(x));
-    }
-
-    template <typename T>
-    KFR_SINTRIN T sindeg(const T& x)
-    {
-        return sin(x * c_degtorad<T>);
-    }
-    template <typename T>
-    KFR_SINTRIN T cosdeg(const T& x)
-    {
-        return cos(x * c_degtorad<T>);
-    }
-
-    template <typename T>
-    KFR_SINTRIN T fastsindeg(const T& x)
-    {
-        return fastsin(x * c_degtorad<T>);
-    }
-    template <typename T>
-    KFR_SINTRIN T fastcosdeg(const T& x)
-    {
-        return fastcos(x * c_degtorad<T>);
-    }
-
-    template <typename T>
-    KFR_SINTRIN T sincosdeg(const T& x)
-    {
-        return sincos(x * c_degtorad<T>);
-    }
-    template <typename T>
-    KFR_SINTRIN T cossindeg(const T& x)
-    {
-        return cossin(x * c_degtorad<T>);
-    }
-
-    KFR_HANDLE_SCALAR(sin)
-    KFR_HANDLE_SCALAR(cos)
-    KFR_HANDLE_SCALAR(fastsin)
-    KFR_HANDLE_SCALAR(fastcos)
-    KFR_HANDLE_SCALAR(sincos)
-    KFR_HANDLE_SCALAR(cossin)
-    KFR_HANDLE_SCALAR(sinc)
-
-    KFR_SPEC_FN(in_sin_cos, sin)
-    KFR_SPEC_FN(in_sin_cos, cos)
-    KFR_SPEC_FN(in_sin_cos, fastsin)
-    KFR_SPEC_FN(in_sin_cos, fastcos)
-    KFR_SPEC_FN(in_sin_cos, sincos_mask)
-    KFR_SPEC_FN(in_sin_cos, sincos)
-    KFR_SPEC_FN(in_sin_cos, cossin)
-    KFR_SPEC_FN(in_sin_cos, sinc)
-    KFR_SPEC_FN(in_sin_cos, sindeg)
-    KFR_SPEC_FN(in_sin_cos, cosdeg)
-    KFR_SPEC_FN(in_sin_cos, fastsindeg)
-    KFR_SPEC_FN(in_sin_cos, fastcosdeg)
-    KFR_SPEC_FN(in_sin_cos, sincosdeg)
-    KFR_SPEC_FN(in_sin_cos, cossindeg)
-};
-}
-
-namespace native
-{
-using fn_sin = internal::in_sin_cos<>::fn_sin;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sin(const T1& x)
+template <typename T>
+constexpr static T fold_constant_div = choose_const<T>(0x1.921fb6p-1f, 0x1.921fb54442d18p-1);
+
+template <typename T>
+constexpr static T fold_constant_hi = choose_const<T>(0x1.922000p-1f, 0x1.921fb40000000p-1);
+template <typename T>
+constexpr static T fold_constant_rem1 = choose_const<T>(-0x1.2ae000p-19f, 0x1.4442d00000000p-25);
+template <typename T>
+constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49);
+
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> trig_horner(vec<T, N>, mask<T, N> msk, T a0, T b0)
 {
-    return internal::in_sin_cos<>::sin(x);
+    return select(msk, a0, b0);
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sin, E1> sin(E1&& x)
+template <typename T, size_t N, typename... Ts>
+KFR_SINTRIN vec<T, N> trig_horner(vec<T, N> x, mask<T, N> msk, T a0, T b0, T a1, T b1, Ts... values)
 {
-    return { fn_sin(), std::forward<E1>(x) };
+    return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
 }
 
-using fn_cos = internal::in_sin_cos<>::fn_cos;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cos(const T1& x)
+template <typename T, size_t N, typename Tprecise = f64>
+KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
 {
-    return internal::in_sin_cos<>::cos(x);
+    const vec<T, N> xabs    = abs(x);
+    constexpr vec<T, N> div = fold_constant_div<T>;
+    vec<T, N> y             = floor(xabs / div);
+    quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
+
+    const mask<T, N> msk = bitcast<T>((quadrant & 1) != 0);
+    quadrant = kfr::select(msk, quadrant + 1, quadrant);
+    y        = select(msk, y + T(1.0), y);
+    quadrant = quadrant & 7;
+
+    constexpr vec<Tprecise, N> hi = cast<Tprecise>(fold_constant_hi<T>);
+    constexpr vec<T, N> rem1      = fold_constant_rem1<T>;
+    constexpr vec<T, N> rem2      = fold_constant_rem2<T>;
+    return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2;
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cos, E1> cos(E1&& x)
+template <size_t N>
+KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
 {
-    return { fn_cos(), std::forward<E1>(x) };
+    constexpr f32 sin_c2  = -0x2.aaaaacp-4f;
+    constexpr f32 sin_c4  = 0x2.222334p-8f;
+    constexpr f32 sin_c6  = -0xd.0566ep-16f;
+    constexpr f32 sin_c8  = 0x3.64cc1cp-20f;
+    constexpr f32 sin_c10 = -0x5.6c4a4p-24f;
+    constexpr f32 cos_c2  = -0x8.p-4f;
+    constexpr f32 cos_c4  = 0xa.aaaabp-8f;
+    constexpr f32 cos_c6  = -0x5.b05d48p-12f;
+    constexpr f32 cos_c8  = 0x1.a065f8p-16f;
+    constexpr f32 cos_c10 = -0x4.cd156p-24f;
+
+    const vec<f32, N> x2 = folded * folded;
+
+    vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6,
+                                      cos_c8, sin_c8, cos_c10, sin_c10);
+
+    formula = select(cosmask, formula, formula * folded);
+    return formula;
 }
-using fn_fastsin = internal::in_sin_cos<>::fn_fastsin;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> fastsin(const T1& x)
+
+template <size_t N>
+KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
 {
-    return internal::in_sin_cos<>::fastsin(x);
+    constexpr f64 sin_c2  = -0x2.aaaaaaaaaaaaap-4;
+    constexpr f64 sin_c4  = 0x2.22222222220cep-8;
+    constexpr f64 sin_c6  = -0xd.00d00cffd6618p-16;
+    constexpr f64 sin_c8  = 0x2.e3bc744fb879ep-20;
+    constexpr f64 sin_c10 = -0x6.b99034c1467a4p-28;
+    constexpr f64 sin_c12 = 0xb.0711ea8fe8ee8p-36;
+    constexpr f64 sin_c14 = -0xb.7e010897e55dp-44;
+    constexpr f64 sin_c16 = -0xb.64eac07f1d6bp-48;
+    constexpr f64 cos_c2  = -0x8.p-4;
+    constexpr f64 cos_c4  = 0xa.aaaaaaaaaaaa8p-8;
+    constexpr f64 cos_c6  = -0x5.b05b05b05ad28p-12;
+    constexpr f64 cos_c8  = 0x1.a01a01a0022e6p-16;
+    constexpr f64 cos_c10 = -0x4.9f93ed845de2cp-24;
+    constexpr f64 cos_c12 = 0x8.f76bc015abe48p-32;
+    constexpr f64 cos_c14 = -0xc.9bf2dbe00379p-40;
+    constexpr f64 cos_c16 = 0xd.1232ac32f7258p-48;
+
+    vec<f64, N> x2 = folded * folded;
+    vec<f64, N> formula =
+        trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8,
+                    cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16);
+
+    formula = select(cosmask, formula, formula * folded);
+    return formula;
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_fastsin, E1> fastsin(E1&& x)
+template <typename T, size_t N, typename = u8[N > 1]>
+KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
 {
-    return { fn_fastsin(), std::forward<E1>(x) };
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x_full, quadrant);
+
+    mask<T, N> flip_sign = select(cosmask, (quadrant == 2) || (quadrant == 4), quadrant >= 4);
+
+    mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
+    usecos = usecos ^ cosmask;
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    mask<T, N> negmask = x_full < 0;
+
+    flip_sign = flip_sign ^ (negmask & ~cosmask);
+
+    formula = select(flip_sign, -formula, formula);
+    return formula;
 }
 
-using fn_fastcos = internal::in_sin_cos<>::fn_fastcos;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> fastcos(const T1& x)
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
 {
-    return internal::in_sin_cos<>::fastcos(x);
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x, quadrant);
+
+    mask<T, N> flip_sign = quadrant >= 4;
+    mask<T, N> usecos    = (quadrant == 2) || (quadrant == 6);
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    formula = select(flip_sign ^ x.asmask(), -formula, formula);
+    return formula;
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_fastcos, E1> fastcos(E1&& x)
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
 {
-    return { fn_fastcos(), std::forward<E1>(x) };
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x, quadrant);
+
+    mask<T, N> eq4       = (quadrant == 4);
+    mask<T, N> flip_sign = (quadrant == 2) || eq4;
+    mask<T, N> usecos    = (quadrant == 0) || eq4;
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    formula = select(flip_sign, -formula, formula);
+    return formula;
 }
 
-using fn_sincos_mask = internal::in_sin_cos<>::fn_sincos_mask;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sincos_mask(const T1& x)
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
+{
+    constexpr vec<T, N> msk = broadcast<N>(internal::highbitmask<T>);
+
+    constexpr static T c2 = -0.16665853559970855712890625;
+    constexpr static T c4 = +8.31427983939647674560546875e-3;
+    constexpr static T c6 = -1.85423981747590005397796630859375e-4;
+
+    const vec<T, N> pi = c_pi<T>;
+
+    x -= pi;
+    vec<T, N> y = abs(x);
+    y = select(y > c_pi<T, 1, 2>, pi - y, y);
+    y = y ^ (msk & ~x);
+
+    vec<T, N> y2      = y * y;
+    vec<T, N> formula = c6;
+    vec<T, N> y3      = y2 * y;
+    formula = fmadd(formula, y2, c4);
+    formula = fmadd(formula, y2, c2);
+    formula = formula * y3 + y;
+    return formula;
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
 {
-    return internal::in_sin_cos<>::sincos_mask(x);
+    x += c_pi<T, 1, 2>;
+    x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
+    return fastsin(x);
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sincos_mask, E1> sincos_mask(E1&& x)
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> sincos(vec<T, N> x)
 {
-    return { fn_sincos_mask(), std::forward<E1>(x) };
+    return sincos_mask(x, internal::oddmask<T, N>());
 }
 
-using fn_sincos = internal::in_sin_cos<>::fn_sincos;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sincos(const T1& x)
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> cossin(vec<T, N> x)
 {
-    return internal::in_sin_cos<>::sincos(x);
+    return sincos_mask(x, internal::evenmask<T, N>());
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sincos, E1> sincos(E1&& x)
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> sinc(vec<T, N> x)
 {
-    return { fn_sincos(), std::forward<E1>(x) };
+    return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x);
 }
 
-using fn_cossin = internal::in_sin_cos<>::fn_cossin;
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cossin(const T1& x)
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> sin(vec<T, N> x)
 {
-    return internal::in_sin_cos<>::cossin(x);
+    return sin(cast<Tout>(x));
 }
 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cossin, E1> cossin(E1&& x)
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> cos(vec<T, N> x)
+{
+    return cos(cast<Tout>(x));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> fastsin(vec<T, N> x)
 {
-    return { fn_cossin(), std::forward<E1>(x) };
+    return fastsin(cast<Tout>(x));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> fastcos(vec<T, N> x)
+{
+    return fastcos(cast<Tout>(x));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> sincos(vec<T, N> x)
+{
+    return sincos(cast<Tout>(x));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> cossin(vec<T, N> x)
+{
+    return cossin(cast<Tout>(x));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
+KFR_SINTRIN vec<Tout, N> sinc(vec<T, N> x)
+{
+    return sinc(cast<Tout>(x));
+}
+
+template <typename T>
+KFR_SINTRIN T sindeg(const T& x)
+{
+    return sin(x * c_degtorad<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T cosdeg(const T& x)
+{
+    return cos(x * c_degtorad<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T fastsindeg(const T& x)
+{
+    return fastsin(x * c_degtorad<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T fastcosdeg(const T& x)
+{
+    return fastcos(x * c_degtorad<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T sincosdeg(const T& x)
+{
+    return sincos(x * c_degtorad<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T cossindeg(const T& x)
+{
+    return cossin(x * c_degtorad<T>);
+}
+
+KFR_HANDLE_SCALAR_1(sin)
+KFR_HANDLE_SCALAR_1(cos)
+KFR_HANDLE_SCALAR_1(fastsin)
+KFR_HANDLE_SCALAR_1(fastcos)
+KFR_HANDLE_SCALAR_1(sincos)
+KFR_HANDLE_SCALAR_1(cossin)
+KFR_HANDLE_SCALAR_1(sinc)
+
+KFR_FN(sin)
+KFR_FN(cos)
+KFR_FN(fastsin)
+KFR_FN(fastcos)
+KFR_FN(sincos)
+KFR_FN(cossin)
+KFR_FN(sinc)
 }
 
-using fn_sindeg = internal::in_sin_cos<>::fn_sindeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sindeg(const T1& x)
+KFR_INTRIN ftype<T1> sin(const T1& x)
 {
-    return internal::in_sin_cos<>::sindeg(x);
+    return internal::sin(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sindeg, E1> sindeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sin, E1> sin(E1&& x)
 {
-    return { fn_sindeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_cosdeg = internal::in_sin_cos<>::fn_cosdeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cosdeg(const T1& x)
+KFR_INTRIN ftype<T1> cos(const T1& x)
 {
-    return internal::in_sin_cos<>::cosdeg(x);
+    return internal::cos(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cosdeg, E1> cosdeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cos, E1> cos(E1&& x)
 {
-    return { fn_cosdeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_fastsindeg = internal::in_sin_cos<>::fn_fastsindeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> fastsindeg(const T1& x)
+KFR_INTRIN ftype<T1> fastsin(const T1& x)
 {
-    return internal::in_sin_cos<>::fastsindeg(x);
+    return internal::fastsin(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_fastsindeg, E1> fastsindeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_fastsin, E1> fastsin(E1&& x)
 {
-    return { fn_fastsindeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_fastcosdeg = internal::in_sin_cos<>::fn_fastcosdeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> fastcosdeg(const T1& x)
+KFR_INTRIN ftype<T1> fastcos(const T1& x)
 {
-    return internal::in_sin_cos<>::fastcosdeg(x);
+    return internal::fastcos(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_fastcosdeg, E1> fastcosdeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_fastcos, E1> fastcos(E1&& x)
 {
-    return { fn_fastcosdeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_sincosdeg = internal::in_sin_cos<>::fn_sincosdeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sincosdeg(const T1& x)
+KFR_INTRIN ftype<T1> sincos(const T1& x)
 {
-    return internal::in_sin_cos<>::sincosdeg(x);
+    return internal::sincos(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sincosdeg, E1> sincosdeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sincos, E1> sincos(E1&& x)
 {
-    return { fn_sincosdeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_cossindeg = internal::in_sin_cos<>::fn_cossindeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> cossindeg(const T1& x)
+KFR_INTRIN ftype<T1> cossin(const T1& x)
 {
-    return internal::in_sin_cos<>::cossindeg(x);
+    return internal::cossin(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_cossindeg, E1> cossindeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_cossin, E1> cossin(E1&& x)
 {
-    return { fn_cossindeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_sinc = internal::in_sin_cos<>::fn_sinc;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> sinc(const T1& x)
 {
-    return internal::in_sin_cos<>::sinc(x);
+    return internal::sinc(x);
 }
+
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sinc, E1> sinc(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sinc, E1> sinc(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
 
 template <typename T>
-inline T sin2x(const T& sinx, const T& cosx)
+KFR_SINTRIN T sin2x(const T& sinx, const T& cosx)
 {
     return 2 * sinx * cosx;
 }
+
 template <typename T>
-inline T sin3x(const T& sinx, const T& cosx)
+KFR_SINTRIN T sin3x(const T& sinx, const T& cosx)
 {
     return sinx * (-1 + 4 * sqr(cosx));
 }
 
 template <typename T>
-inline T cos2x(const T& sinx, const T& cosx)
+KFR_SINTRIN T cos2x(const T& sinx, const T& cosx)
 {
     return sqr(cosx) - sqr(sinx);
 }
+
 template <typename T>
-inline T cos3x(const T& sinx, const T& cosx)
+KFR_SINTRIN T cos3x(const T& sinx, const T& cosx)
 {
     return cosx * (1 - 4 * sqr(sinx));
 }
 }
-}
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -30,77 +30,42 @@ namespace kfr
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native>
-struct in_sqrt : in_sqrt<older(c)>
-{
-    struct fn_sqrt : fn_disabled
-    {
-    };
-};
-
-template <>
-struct in_sqrt<cpu_t::common>
-{
-    constexpr static cpu_t cpu = cpu_t::common;
-
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> sqrt(vec<f32, N> x)
-    {
-        return apply([](float xx) { return std::sqrt(xx); }, x);
-    }
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> sqrt(vec<f64, N> x)
-    {
-        return apply([](double xx) { return std::sqrt(xx); }, x);
-    }
+#if defined CID_ARCH_SSE2
 
-    KFR_HANDLE_SCALAR(sqrt)
-    KFR_SPEC_FN(in_sqrt, sqrt)
-};
+KFR_SINTRIN f32x1 sqrt(f32x1 x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
+KFR_SINTRIN f64x1 sqrt(f64x1 x) { return slice<0, 1>(tovec(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x)))); }
+KFR_SINTRIN f32sse sqrt(f32sse x) { return _mm_sqrt_ps(*x); }
+KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }
 
-#ifdef CID_ARCH_X86
-
-template <>
-struct in_sqrt<cpu_t::sse2>
-{
-    constexpr static cpu_t cpu = cpu_t::sse2;
+#if defined CID_ARCH_AVX
+KFR_SINTRIN f32avx sqrt(f32avx x) { return _mm256_sqrt_ps(*x); }
+KFR_SINTRIN f64avx sqrt(f64avx x) { return _mm256_sqrt_pd(*x); }
+#endif
 
-    KFR_SINTRIN f32sse sqrt(f32sse x) { return _mm_sqrt_ps(*x); }
-    KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }
+KFR_HANDLE_ALL_SIZES_1(sqrt)
 
-    KFR_HANDLE_ALL(sqrt)
-    KFR_HANDLE_SCALAR(sqrt)
-    KFR_SPEC_FN(in_sqrt, sqrt)
-};
+#else
 
-template <>
-struct in_sqrt<cpu_t::avx1> : in_sqrt<cpu_t::sse2>
+// fallback
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> sqrt(vec<T, N> x)
 {
-    constexpr static cpu_t cpu = cpu_t::avx1;
-    using in_sqrt<cpu_t::sse2>::sqrt;
-
-    KFR_SINTRIN f32avx KFR_USE_CPU(avx) sqrt(f32avx x) { return _mm256_sqrt_ps(*x); }
-    KFR_SINTRIN f64avx KFR_USE_CPU(avx) sqrt(f64avx x) { return _mm256_sqrt_pd(*x); }
-
-    KFR_HANDLE_ALL(sqrt)
-    KFR_HANDLE_SCALAR(sqrt)
-    KFR_SPEC_FN(in_sqrt, sqrt)
-};
+    return apply([](T x) { return std::sqrt(x); }, x);
+}
 #endif
+KFR_HANDLE_SCALAR_1(sqrt)
+KFR_FN(sqrt)
 }
-namespace native
-{
-using fn_sqrt = internal::in_sqrt<>::fn_sqrt;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
 KFR_INTRIN ftype<T1> sqrt(const T1& x)
 {
-    return internal::in_sqrt<>::sqrt(x);
+    return internal::sqrt(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sqrt, E1> sqrt(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sqrt, E1> sqrt(E1&& x)
 {
-    return { fn_sqrt(), std::forward<E1>(x) };
-}
+    return { {}, std::forward<E1>(x) };
 }
 }
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -28,156 +28,129 @@
 #include "select.hpp"
 #include "sin_cos.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-#if CID_HAS_WARNING("-Wc99-extensions")
-#pragma clang diagnostic ignored "-Wc99-extensions"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_tan : in_trig<cc>, in_select<cc>, in_round<cc>, in_abs<cc>
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_SINTRIN vec<T, N> trig_fold_simple(vec<T, N> x_full, mask<T, N>& inverse)
+{
+    constexpr T pi_14 = c_pi<T, 1, 4>;
+
+    vec<T, N> y      = abs(x_full);
+    vec<T, N> scaled = y / pi_14;
+
+    vec<T, N> k_real = floor(scaled);
+    vec<IT, N> k     = cast<IT>(k_real);
+
+    vec<T, N> x = y - k_real * pi_14;
+
+    mask<T, N> need_offset = (k & 1) != 0;
+    x = select(need_offset, x - pi_14, x);
+
+    vec<IT, N> k_mod4 = k & 3;
+    inverse = (k_mod4 == 1) || (k_mod4 == 2);
+    return x;
+}
+
+template <size_t N>
+KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
 {
-private:
-    using in_abs<cc>::abs;
-    using in_round<cc>::floor;
-    using in_select<cc>::select;
-    using in_trig<cc>::mask_horner;
-
-    template <typename T, size_t N, typename IT = itype<T>>
-    KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x_full, mask<T, N>& inverse)
-    {
-        constexpr T pi_14 = c_pi<T, 1, 4>;
-
-        vec<T, N> y      = abs(x_full);
-        vec<T, N> scaled = y / pi_14;
-
-        vec<T, N> k_real = floor(scaled);
-        vec<IT, N> k     = cast<IT>(k_real);
-
-        vec<T, N> x = y - k_real * pi_14;
-
-        mask<T, N> need_offset = (k & 1) != 0;
-        x = select(need_offset, x - pi_14, x);
-
-        vec<IT, N> k_mod4 = k & 3;
-        inverse = (k_mod4 == 1) || (k_mod4 == 2);
-        return x;
-    }
-
-public:
-    template <size_t N>
-    KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
-    {
-        mask<f32, N> inverse;
-        const vec<f32, N> x = trig_fold(x_full, inverse);
-
-        constexpr f32 tan_c2  = 0x5.555378p-4;
-        constexpr f32 tan_c4  = 0x2.225bb8p-4;
-        constexpr f32 tan_c6  = 0xd.ac3fep-8;
-        constexpr f32 tan_c8  = 0x6.41644p-8;
-        constexpr f32 tan_c10 = 0xc.bfe7ep-12;
-        constexpr f32 tan_c12 = 0x2.6754dp-8;
-
-        constexpr f32 cot_c2  = -0x5.555558p-4;
-        constexpr f32 cot_c4  = -0x5.b0581p-8;
-        constexpr f32 cot_c6  = -0x8.ac5ccp-12;
-        constexpr f32 cot_c8  = -0xd.aaa01p-16;
-        constexpr f32 cot_c10 = -0x1.a9a9b4p-16;
-        constexpr f32 cot_c12 = -0x6.f7d4dp-24;
-
-        const vec<f32, N> x2  = x * x;
-        const vec<f32, N> val = mask_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6,
-                                            tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12);
-
-        const vec<f32, N> z = select(inverse, val / -x, val * x);
-        return mulsign(z, x_full);
-    }
-
-    template <size_t N>
-    KFR_SINTRIN vec<f64, N> tan(vec<f64, N> x_full)
-    {
-        mask<f64, N> inverse;
-        const vec<f64, N> x = trig_fold(x_full, inverse);
-
-        constexpr f64 tan_c2  = 0x5.5555554d8e5b8p-4;
-        constexpr f64 tan_c4  = 0x2.222224820264p-4;
-        constexpr f64 tan_c6  = 0xd.d0d90de32b3e8p-8;
-        constexpr f64 tan_c8  = 0x5.99723bdcf5cacp-8;
-        constexpr f64 tan_c10 = 0x2.434a142e413ap-8;
-        constexpr f64 tan_c12 = 0xf.2b59061305efp-12;
-        constexpr f64 tan_c14 = 0x4.a12565071a664p-12;
-        constexpr f64 tan_c16 = 0x4.dada3797ac1bcp-12;
-        constexpr f64 tan_c18 = -0x1.a74976b6ea3f3p-12;
-        constexpr f64 tan_c20 = 0x1.d06a5ae5e4a74p-12;
-
-        constexpr f64 cot_c2  = -0x5.5555555555554p-4;
-        constexpr f64 cot_c4  = -0x5.b05b05b05b758p-8;
-        constexpr f64 cot_c6  = -0x8.ab355dffc79a8p-12;
-        constexpr f64 cot_c8  = -0xd.debbca405c9f8p-16;
-        constexpr f64 cot_c10 = -0x1.66a8edb99b15p-16;
-        constexpr f64 cot_c12 = -0x2.450239be0ee92p-20;
-        constexpr f64 cot_c14 = -0x3.ad6ddb4719438p-24;
-        constexpr f64 cot_c16 = -0x5.ff4c42741356p-28;
-        constexpr f64 cot_c18 = -0x9.06881bcdf3108p-32;
-        constexpr f64 cot_c20 = -0x1.644abedc113cap-32;
-
-        const vec<f64, N> x2 = x * x;
-        const vec<f64, N> val =
-            mask_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, cot_c8, tan_c8,
-                        cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, cot_c16, tan_c16, cot_c18,
-                        tan_c18, cot_c20, tan_c20);
-
-        const vec<f64, N> z = select(inverse, val / -x, val * x);
-        return mulsign(z, x_full);
-    }
-    template <typename T>
-    KFR_SINTRIN T tandeg(const T& x)
-    {
-        return tan(x * c_degtorad<T>);
-    }
-
-    KFR_HANDLE_SCALAR(tan)
-    KFR_SPEC_FN(in_tan, tan)
-    KFR_SPEC_FN(in_tan, tandeg)
-};
+    mask<f32, N> inverse;
+    const vec<f32, N> x = trig_fold_simple(x_full, inverse);
+
+    constexpr f32 tan_c2  = 0x5.555378p-4;
+    constexpr f32 tan_c4  = 0x2.225bb8p-4;
+    constexpr f32 tan_c6  = 0xd.ac3fep-8;
+    constexpr f32 tan_c8  = 0x6.41644p-8;
+    constexpr f32 tan_c10 = 0xc.bfe7ep-12;
+    constexpr f32 tan_c12 = 0x2.6754dp-8;
+
+    constexpr f32 cot_c2  = -0x5.555558p-4;
+    constexpr f32 cot_c4  = -0x5.b0581p-8;
+    constexpr f32 cot_c6  = -0x8.ac5ccp-12;
+    constexpr f32 cot_c8  = -0xd.aaa01p-16;
+    constexpr f32 cot_c10 = -0x1.a9a9b4p-16;
+    constexpr f32 cot_c12 = -0x6.f7d4dp-24;
+
+    const vec<f32, N> x2  = x * x;
+    const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6,
+                                        tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12);
+
+    const vec<f32, N> z = select(inverse, val / -x, val * x);
+    return mulsign(z, x_full);
 }
 
-namespace native
+template <size_t N>
+KFR_SINTRIN vec<f64, N> tan(vec<f64, N> x_full)
 {
-using fn_tan = internal::in_tan<>::fn_tan;
+    mask<f64, N> inverse;
+    const vec<f64, N> x = trig_fold_simple(x_full, inverse);
+
+    constexpr f64 tan_c2  = 0x5.5555554d8e5b8p-4;
+    constexpr f64 tan_c4  = 0x2.222224820264p-4;
+    constexpr f64 tan_c6  = 0xd.d0d90de32b3e8p-8;
+    constexpr f64 tan_c8  = 0x5.99723bdcf5cacp-8;
+    constexpr f64 tan_c10 = 0x2.434a142e413ap-8;
+    constexpr f64 tan_c12 = 0xf.2b59061305efp-12;
+    constexpr f64 tan_c14 = 0x4.a12565071a664p-12;
+    constexpr f64 tan_c16 = 0x4.dada3797ac1bcp-12;
+    constexpr f64 tan_c18 = -0x1.a74976b6ea3f3p-12;
+    constexpr f64 tan_c20 = 0x1.d06a5ae5e4a74p-12;
+
+    constexpr f64 cot_c2  = -0x5.5555555555554p-4;
+    constexpr f64 cot_c4  = -0x5.b05b05b05b758p-8;
+    constexpr f64 cot_c6  = -0x8.ab355dffc79a8p-12;
+    constexpr f64 cot_c8  = -0xd.debbca405c9f8p-16;
+    constexpr f64 cot_c10 = -0x1.66a8edb99b15p-16;
+    constexpr f64 cot_c12 = -0x2.450239be0ee92p-20;
+    constexpr f64 cot_c14 = -0x3.ad6ddb4719438p-24;
+    constexpr f64 cot_c16 = -0x5.ff4c42741356p-28;
+    constexpr f64 cot_c18 = -0x9.06881bcdf3108p-32;
+    constexpr f64 cot_c20 = -0x1.644abedc113cap-32;
+
+    const vec<f64, N> x2  = x * x;
+    const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6,
+                                        cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14,
+                                        cot_c16, tan_c16, cot_c18, tan_c18, cot_c20, tan_c20);
+
+    const vec<f64, N> z = select(inverse, val / -x, val * x);
+    return mulsign(z, x_full);
+}
+template <typename T>
+KFR_SINTRIN T tandeg(const T& x)
+{
+    return tan(x * c_degtorad<T>);
+}
+
+KFR_HANDLE_SCALAR(tan)
+KFR_FN(tan)
+KFR_FN(tandeg)
+}
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> tan(const T1& x)
+KFR_INTRIN T1 tan(const T1& x)
 {
-    return internal::in_tan<>::tan(x);
+    return internal::tan(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_tan, E1> tan(E1&& x)
+KFR_INTRIN expr_func<internal::fn_tan, E1> tan(E1&& x)
 {
-    return { fn_tan(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 
-using fn_tandeg = internal::in_tan<>::fn_tandeg;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> tandeg(const T1& x)
+KFR_INTRIN T1 tandeg(const T1& x)
 {
-    return internal::in_tan<>::tandeg(x);
+    return internal::tandeg(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_tandeg, E1> tandeg(E1&& x)
+KFR_INTRIN expr_func<internal::fn_tandeg, E1> tandeg(E1&& x)
 {
-    return { fn_tandeg(), std::forward<E1>(x) };
+    return { {}, std::forward<E1>(x) };
 }
 }
-}
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/dft/conv.hpp b/include/kfr/dft/conv.hpp
@@ -27,7 +27,6 @@
 #include "../base/memory.hpp"
 #include "../base/read_write.hpp"
 #include "../base/vec.hpp"
-#include "../expressions/operators.hpp"
 
 #include "fft.hpp"
 
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -215,8 +215,8 @@ KFR_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size)
     else
     {
         double kth  = c_pi<double, 2> * (n / static_cast<double>(size));
-        double tcos = +kfr::native::cos(kth);
-        double tsin = -kfr::native::sin(kth);
+        double tcos = +kfr::cos(kth);
+        double tsin = -kfr::sin(kth);
         return make_vector(static_cast<T>(tcos), static_cast<T>(tsin));
     }
 }
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -485,7 +485,7 @@ constexpr cvec<T, N> twiddleimagmask()
 template <typename T, size_t N>
 KFR_NOINLINE static vec<T, N> cossin_conj(vec<T, N> x)
 {
-    return cconj(in_sin_cos<cpu_t::native>::cossin(x));
+    return cconj(cossin(x));
 }
 
 template <size_t k, size_t size, bool inverse = false, typename T, size_t width>
diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp
@@ -26,7 +26,6 @@
 #include "../base/sin_cos.hpp"
 #include "../base/vec.hpp"
 #include "../expressions/basic.hpp"
-#include "../expressions/operators.hpp"
 #include "../expressions/reduce.hpp"
 #include "window.hpp"
 
@@ -38,90 +37,72 @@ using fir_taps = univector<T, Size>;
 
 namespace internal
 {
-template <cpu_t cpu = cpu_t::native>
-struct in_fir : in_reduce<cpu>
+template <size_t tapcount, typename T, typename E1>
+struct expression_short_fir : expression<E1>
 {
-private:
-    using in_reduce<cpu>::dotproduct;
+    static_assert(is_poweroftwo(tapcount), "tapcount must be a power of two");
 
-public:
-    template <size_t tapcount, typename T, typename E1>
-    struct expression_short_fir : expression<E1>
+    expression_short_fir(E1&& e1, const array_ref<T>& taps)
+        : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0)
     {
-        static_assert(is_poweroftwo(tapcount), "tapcount must be a power of two");
-        template <cpu_t newcpu>
-        using retarget_this =
-            typename in_fir<newcpu>::template expression_short_fir<tapcount, T, retarget<E1, newcpu>>;
-
-        expression_short_fir(E1&& e1, const array_ref<T>& taps)
-            : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
-        {
-            vec<T, N> in = cast<T>(this->argument_first(index, x));
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
+    {
+        vec<T, N> in = cast<T>(this->argument_first(index, x));
 
-            vec<T, N> out = in * taps[0];
-            cfor(csize<1>, csize<tapcount>,
-                 [&](auto I) { out = out + concat_and_slice<tapcount - 1 - I, N>(delayline, in) * taps[I]; });
-            delayline = concat_and_slice<N, tapcount - 1>(delayline, in);
+        vec<T, N> out = in * taps[0];
+        cfor(csize<1>, csize<tapcount>,
+             [&](auto I) { out = out + concat_and_slice<tapcount - 1 - I, N>(delayline, in) * taps[I]; });
+        delayline = concat_and_slice<N, tapcount - 1>(delayline, in);
 
-            return cast<U>(out);
-        }
-        vec<T, tapcount> taps;
-        mutable vec<T, tapcount - 1> delayline;
-    };
+        return cast<U>(out);
+    }
+    vec<T, tapcount> taps;
+    mutable vec<T, tapcount - 1> delayline;
+};
 
-    template <typename T, typename E1>
-    struct expression_fir : expression<E1>
+template <typename T, typename E1>
+struct expression_fir : expression<E1>
+{
+    expression_fir(E1&& e1, const array_ref<const T>& taps)
+        : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(taps.size(), T()), delayline_cursor(0)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
     {
-        template <cpu_t newcpu>
-        using retarget_this = typename in_fir<newcpu>::template expression_fir<T, retarget<E1, newcpu>>;
+        const size_t tapcount = taps.size();
+        const vec<T, N> input = cast<T>(this->argument_first(index, x));
 
-        expression_fir(E1&& e1, const array_ref<const T>& taps)
-            : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(taps.size(), T()),
-              delayline_cursor(0)
+        vec<T, N> output;
+        size_t cursor = delayline_cursor;
+        KFR_LOOP_NOUNROLL
+        for (size_t i = 0; i < N; i++)
         {
+            delayline.ringbuf_write(cursor, input[i]);
+            output(i) = dotproduct(taps, delayline.slice(cursor) /*, tapcount - cursor*/) +
+                        dotproduct(taps.slice(tapcount - cursor), delayline /*, cursor*/);
         }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
-        {
-            const size_t tapcount = taps.size();
-            const vec<T, N> input = cast<T>(this->argument_first(index, x));
-
-            vec<T, N> output;
-            size_t cursor = delayline_cursor;
-            KFR_LOOP_NOUNROLL
-            for (size_t i = 0; i < N; i++)
-            {
-                delayline.ringbuf_write(cursor, input[i]);
-                output(i) = dotproduct(taps, delayline.slice(cursor) /*, tapcount - cursor*/) +
-                            dotproduct(taps.slice(tapcount - cursor), delayline /*, cursor*/);
-            }
-            delayline_cursor = cursor;
-            return cast<U>(output);
-        }
-        univector_dyn<T> taps;
-        mutable univector_dyn<T> delayline;
-        mutable size_t delayline_cursor;
-    };
+        delayline_cursor = cursor;
+        return cast<U>(output);
+    }
+    univector_dyn<T> taps;
+    mutable univector_dyn<T> delayline;
+    mutable size_t delayline_cursor;
 };
 }
 
-namespace native
-{
 template <typename T, typename E1, size_t Tag>
-KFR_INLINE internal::in_fir<>::expression_fir<T, E1> fir(E1&& e1, const univector<T, Tag>& taps)
+KFR_INLINE internal::expression_fir<T, E1> fir(E1&& e1, const univector<T, Tag>& taps)
 {
-    return internal::in_fir<>::expression_fir<T, E1>(std::forward<E1>(e1), taps.ref());
+    return internal::expression_fir<T, E1>(std::forward<E1>(e1), taps.ref());
 }
 template <typename T, size_t TapCount, typename E1>
-KFR_INLINE internal::in_fir<>::expression_short_fir<TapCount, T, E1> short_fir(
-    E1&& e1, const univector<T, TapCount>& taps)
+KFR_INLINE internal::expression_short_fir<TapCount, T, E1> short_fir(E1&& e1,
+                                                                     const univector<T, TapCount>& taps)
 {
     static_assert(TapCount >= 1 && TapCount < 16, "Use short_fir only for small FIR filters");
-    return internal::in_fir<>::expression_short_fir<TapCount, T, E1>(std::forward<E1>(e1), taps.ref());
-}
+    return internal::expression_short_fir<TapCount, T, E1>(std::forward<E1>(e1), taps.ref());
 }
 }
diff --git a/include/kfr/dsp/fir_design.hpp b/include/kfr/dsp/fir_design.hpp
@@ -24,218 +24,124 @@
 
 #include "fir.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 namespace internal
 {
-template <cpu_t cpu = cpu_t::native>
-struct in_fir_design : in_sqrt<cpu>,
-                       in_abs<cpu>,
-                       in_log_exp<cpu>,
-                       in_sin_cos<cpu>,
-                       in_window<cpu>,
-                       in_reduce<cpu>
+template <typename T>
+KFR_SINTRIN void fir_lowpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window,
+                             bool normalize = true)
 {
-private:
-    using in_sqrt<cpu>::sqrt;
-    using in_abs<cpu>::abs;
-    using in_log_exp<cpu>::log;
-    using in_log_exp<cpu>::exp;
-    using in_log_exp<cpu>::log_fmadd;
-    using in_log_exp<cpu>::exp_fmadd;
-    using in_log_exp<cpu>::exp10;
-    using typename in_sin_cos<cpu>::fn_sinc;
-    using in_reduce<cpu>::reduce;
-    using in_reduce<cpu>::dotproduct;
-    using in_reduce<cpu>::sum;
-
-public:
-    template <typename T>
-    KFR_SINTRIN void fir_lowpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window,
-                                 bool normalize = true)
+    const T scale = 2.0 * cutoff;
+    taps          = bind_expression(fn_sinc(),
+                           symmlinspace<T, true>((taps.size() - 1) * cutoff * c_pi<T>, taps.size(), true)) *
+           scale * window;
+
+    if (is_odd(taps.size()))
+        taps[taps.size() / 2] = scale;
+
+    if (normalize)
     {
-        const T scale = 2.0 * cutoff;
-        taps          = bind_expression(fn_sinc(), symmlinspace<T, true>((taps.size() - 1) * cutoff * c_pi<T>,
-                                                                taps.size(), true)) *
-               scale * window;
-
-        if (is_odd(taps.size()))
-            taps[taps.size() / 2] = scale;
-
-        if (normalize)
-        {
-            const T invsum = reciprocal(sum(taps));
-            taps           = taps * invsum;
-        }
+        const T invsum = reciprocal(sum(taps));
+        taps           = taps * invsum;
     }
-    template <typename T>
-    KFR_SINTRIN void fir_highpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window,
-                                  bool normalize = true)
+}
+template <typename T>
+KFR_SINTRIN void fir_highpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window,
+                              bool normalize = true)
+{
+    const T scale = 2.0 * -cutoff;
+    taps          = bind_expression(fn_sinc(),
+                           symmlinspace<T, true>((taps.size() - 1) * cutoff * c_pi<T>, taps.size(), true)) *
+           scale * window;
+
+    if (is_odd(taps.size()))
+        taps[taps.size() / 2] = 1 - 2.0 * cutoff;
+
+    if (normalize)
     {
-        const T scale = 2.0 * -cutoff;
-        taps          = bind_expression(fn_sinc(), symmlinspace<T, true>((taps.size() - 1) * cutoff * c_pi<T>,
-                                                                taps.size(), true)) *
-               scale * window;
-
-        if (is_odd(taps.size()))
-            taps[taps.size() / 2] = 1 - 2.0 * cutoff;
-
-        if (normalize)
-        {
-            const T invsum = reciprocal(sum(taps) + 1);
-            taps           = taps * invsum;
-        }
+        const T invsum = reciprocal(sum(taps) + 1);
+        taps           = taps * invsum;
     }
+}
+
+template <typename T>
+KFR_SINTRIN void fir_bandpass(univector_ref<T> taps, T frequency1, T frequency2,
+                              const expression_pointer<T>& window, bool normalize = true)
+{
+    const T scale1 = 2.0 * frequency1;
+    const T scale2 = 2.0 * frequency2;
+    const T sc     = c_pi<T> * T(taps.size() - 1);
+    const T start1 = sc * frequency1;
+    const T start2 = sc * frequency2;
+
+    taps = (bind_expression(fn_sinc(), symmlinspace<T, true>(start2, taps.size(), true)) * scale2 -
+            bind_expression(fn_sinc(), symmlinspace<T, true>(start1, taps.size(), true)) * scale1) *
+           window;
 
-    template <typename T>
-    KFR_SINTRIN void fir_bandpass(univector_ref<T> taps, T frequency1, T frequency2,
-                                  const expression_pointer<T>& window, bool normalize = true)
+    if (is_odd(taps.size()))
+        taps[taps.size() / 2] = 2 * (frequency2 - frequency1);
+
+    if (normalize)
     {
-        const T scale1 = 2.0 * frequency1;
-        const T scale2 = 2.0 * frequency2;
-        const T sc     = c_pi<T> * T(taps.size() - 1);
-        const T start1 = sc * frequency1;
-        const T start2 = sc * frequency2;
-
-        taps = (bind_expression(fn_sinc(), symmlinspace<T, true>(start2, taps.size(), true)) * scale2 -
-                bind_expression(fn_sinc(), symmlinspace<T, true>(start1, taps.size(), true)) * scale1) *
-               window;
-
-        if (is_odd(taps.size()))
-            taps[taps.size() / 2] = 2 * (frequency2 - frequency1);
-
-        if (normalize)
-        {
-            const T invsum = reciprocal(sum(taps) + 1);
-            taps           = taps * invsum;
-        }
+        const T invsum = reciprocal(sum(taps) + 1);
+        taps           = taps * invsum;
     }
+}
+
+template <typename T>
+KFR_SINTRIN void fir_bandstop(univector_ref<T> taps, T frequency1, T frequency2,
+                              const expression_pointer<T>& window, bool normalize = true)
+{
+    const T scale1 = 2.0 * frequency1;
+    const T scale2 = 2.0 * frequency2;
+    const T sc     = c_pi<T> * T(taps.size() - 1);
+    const T start1 = sc * frequency1;
+    const T start2 = sc * frequency2;
+
+    taps = (bind_expression(fn_sinc(), symmlinspace<T, true>(start1, taps.size(), true)) * scale1 -
+            bind_expression(fn_sinc(), symmlinspace<T, true>(start2, taps.size(), true)) * scale2) *
+           window;
+
+    if (is_odd(taps.size()))
+        taps[taps.size() / 2] = 1 - 2 * (frequency2 - frequency1);
 
-    template <typename T>
-    KFR_SINTRIN void fir_bandstop(univector_ref<T> taps, T frequency1, T frequency2,
-                                  const expression_pointer<T>& window, bool normalize = true)
+    if (normalize)
     {
-        const T scale1 = 2.0 * frequency1;
-        const T scale2 = 2.0 * frequency2;
-        const T sc     = c_pi<T> * T(taps.size() - 1);
-        const T start1 = sc * frequency1;
-        const T start2 = sc * frequency2;
-
-        taps = (bind_expression(fn_sinc(), symmlinspace<T, true>(start1, taps.size(), true)) * scale1 -
-                bind_expression(fn_sinc(), symmlinspace<T, true>(start2, taps.size(), true)) * scale2) *
-               window;
-
-        if (is_odd(taps.size()))
-            taps[taps.size() / 2] = 1 - 2 * (frequency2 - frequency1);
-
-        if (normalize)
-        {
-            const T invsum = reciprocal(sum(taps));
-            taps           = taps * invsum;
-        }
+        const T invsum = reciprocal(sum(taps));
+        taps           = taps * invsum;
     }
+}
 
-    template <size_t tapcount, typename T, typename E1>
-    struct expression_short_fir : expression<E1>
-    {
-        static_assert(is_poweroftwo(tapcount), "tapcount must be a power of two");
-        template <cpu_t newcpu>
-        using retarget_this =
-            typename in_fir<newcpu>::template expression_short_fir<tapcount, T, retarget<E1, newcpu>>;
-
-        expression_short_fir(E1&& e1, const array_ref<T>& taps)
-            : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
-        {
-            vec<T, N> in = cast<T>(this->argument_first(index, x));
-
-            vec<T, N> out = in * taps[0];
-            cfor(csize<1>, csize<tapcount>,
-                 [&](auto I) { out = out + concat_and_slice<tapcount - 1 - I, N>(delayline, in) * taps[I]; });
-            delayline = concat_and_slice<N, tapcount - 1>(delayline, in);
-
-            return cast<U>(out);
-        }
-        vec<T, tapcount> taps;
-        mutable vec<T, tapcount - 1> delayline;
-    };
-
-    template <typename T, typename E1>
-    struct expression_fir : expression<E1>
-    {
-        template <cpu_t newcpu>
-        using retarget_this = typename in_fir<newcpu>::template expression_fir<T, retarget<E1, newcpu>>;
-
-        expression_fir(E1&& e1, const array_ref<const T>& taps)
-            : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(taps.size(), T()),
-              delayline_cursor(0)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
-        {
-            const size_t tapcount = taps.size();
-            const vec<T, N> input = cast<T>(this->argument_first(index, x));
-
-            vec<T, N> output;
-            size_t cursor = delayline_cursor;
-            KFR_LOOP_NOUNROLL
-            for (size_t i = 0; i < N; i++)
-            {
-                delayline.ringbuf_write(cursor, input[i]);
-                output(i) = dotproduct(taps, delayline.slice(cursor) /*, tapcount - cursor*/) +
-                            dotproduct(taps.slice(tapcount - cursor), delayline /*, cursor*/);
-            }
-            delayline_cursor = cursor;
-            return cast<U>(output);
-        }
-        univector_dyn<T> taps;
-        mutable univector_dyn<T> delayline;
-        mutable size_t delayline_cursor;
-    };
-    KFR_SPEC_FN(in_fir_design, fir_lowpass)
-    KFR_SPEC_FN(in_fir_design, fir_highpass)
-    KFR_SPEC_FN(in_fir_design, fir_bandpass)
-    KFR_SPEC_FN(in_fir_design, fir_bandstop)
-};
+KFR_FN(fir_lowpass)
+KFR_FN(fir_highpass)
+KFR_FN(fir_bandpass)
+KFR_FN(fir_bandstop)
 }
 
-namespace native
-{
 template <typename T, size_t Tag>
 KFR_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window,
                             bool normalize = true)
 {
-    return internal::in_fir_design<>::fir_lowpass(taps.slice(), cutoff, window, normalize);
+    return internal::fir_lowpass(taps.slice(), cutoff, window, normalize);
 }
 template <typename T, size_t Tag>
 KFR_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window,
                              bool normalize = true)
 {
-    return internal::in_fir_design<>::fir_highpass(taps.slice(), cutoff, window, normalize);
+    return internal::fir_highpass(taps.slice(), cutoff, window, normalize);
 }
 template <typename T, size_t Tag>
 KFR_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
                              const expression_pointer<T>& window, bool normalize = true)
 {
-    return internal::in_fir_design<>::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize);
+    return internal::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize);
 }
 template <typename T, size_t Tag>
 KFR_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
                              const expression_pointer<T>& window, bool normalize = true)
 {
-    return internal::in_fir_design<>::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize);
-}
+    return internal::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize);
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/dsp/fracdelay.hpp b/include/kfr/dsp/fracdelay.hpp
@@ -27,18 +27,14 @@
 namespace kfr
 {
 
-namespace native
-{
-
 template <typename T, typename E1>
-KFR_INLINE internal::in_fir<>::expression_short_fir<2, T, E1> fracdelay(E1&& e1, T delay)
+KFR_INLINE internal::expression_short_fir<2, T, E1> fracdelay(E1&& e1, T delay)
 {
     if (delay < 0)
         delay = 0;
     if (delay > 1)
         delay = fract(delay);
     univector<T, 2> taps({ 1 - delay, delay });
-    return internal::in_fir<>::expression_short_fir<2, T, E1>(std::forward<E1>(e1), taps.ref());
-}
+    return internal::expression_short_fir<2, T, E1>(std::forward<E1>(e1), taps.ref());
 }
 }
diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp
@@ -23,28 +23,20 @@
 #pragma once
 
 #include "../base/sin_cos.hpp"
-#include "../base/vec.hpp"
 #include "../expressions/basic.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
 template <typename T>
 auto jaehne(T magn, size_t size)
 {
-    using namespace native;
     return typed<T>(magn * sin(c_pi<T, 1, 2> * sqr(linspace(T(0), T(size), size, false)) / size), size);
 }
 
 template <typename T>
 auto swept(T magn, size_t size)
 {
-    using namespace native;
     return typed<T>(
         magn * sin(c_pi<T, 1, 4> * sqr(sqr(linspace(T(0), T(size), size, false)) / sqr(T(size))) * T(size)),
         size);
@@ -52,277 +44,235 @@ auto swept(T magn, size_t size)
 
 namespace internal
 {
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_oscillators : in_sin_cos<cc>, in_select<cc>, in_round<cc>, in_abs<cc>
+template <typename T>
+KFR_SINTRIN T rawsine(T x)
 {
-private:
-    using in_sin_cos<cc>::fastsin;
-    using in_sin_cos<cc>::sin;
-    using in_select<cc>::select;
-    using in_round<cc>::fract;
-    using in_abs<cc>::abs;
-
-public:
-    template <typename T>
-    KFR_SINTRIN T rawsine(T x)
-    {
-        return fastsin(x * c_pi<T, 2>);
-    }
-    template <typename T>
-    KFR_SINTRIN T sinenorm(T x)
-    {
-        return rawsine(fract(x));
-    }
-    template <typename T>
-    KFR_SINTRIN T sine(T x)
-    {
-        return sinenorm(c_recip_pi<T, 1, 2> * x);
-    }
+    return fastsin(x * c_pi<T, 2>);
+}
+template <typename T>
+KFR_SINTRIN T sinenorm(T x)
+{
+    return rawsine(fract(x));
+}
+template <typename T>
+KFR_SINTRIN T sine(T x)
+{
+    return sinenorm(c_recip_pi<T, 1, 2> * x);
+}
 
-    template <typename T>
-    KFR_SINTRIN T rawsquare(T x)
-    {
-        return select(x < T(0.5), T(1), -T(1));
-    }
-    template <typename T>
-    KFR_SINTRIN T squarenorm(T x)
-    {
-        return rawsquare(fract(x));
-    }
-    template <typename T>
-    KFR_SINTRIN T square(T x)
-    {
-        return squarenorm(c_recip_pi<T, 1, 2> * x);
-    }
+template <typename T>
+KFR_SINTRIN T rawsquare(T x)
+{
+    return select(x < T(0.5), T(1), -T(1));
+}
+template <typename T>
+KFR_SINTRIN T squarenorm(T x)
+{
+    return rawsquare(fract(x));
+}
+template <typename T>
+KFR_SINTRIN T square(T x)
+{
+    return squarenorm(c_recip_pi<T, 1, 2> * x);
+}
 
-    template <typename T>
-    KFR_SINTRIN T rawsawtooth(T x)
-    {
-        return T(1) - 2 * x;
-    }
-    template <typename T>
-    KFR_SINTRIN T sawtoothnorm(T x)
-    {
-        return rawsawtooth(fract(x));
-    }
-    template <typename T>
-    KFR_SINTRIN T sawtooth(T x)
-    {
-        return sawtoothnorm(c_recip_pi<T, 1, 2> * x);
-    }
+template <typename T>
+KFR_SINTRIN T rawsawtooth(T x)
+{
+    return T(1) - 2 * x;
+}
+template <typename T>
+KFR_SINTRIN T sawtoothnorm(T x)
+{
+    return rawsawtooth(fract(x));
+}
+template <typename T>
+KFR_SINTRIN T sawtooth(T x)
+{
+    return sawtoothnorm(c_recip_pi<T, 1, 2> * x);
+}
 
-    template <typename T>
-    KFR_SINTRIN T isawtoothnorm(T x)
-    {
-        return T(-1) + 2 * fract(x + 0.5);
-    }
-    template <typename T>
-    KFR_SINTRIN T isawtooth(T x)
-    {
-        return isawtoothnorm(c_recip_pi<T, 1, 2> * x);
-    }
+template <typename T>
+KFR_SINTRIN T isawtoothnorm(T x)
+{
+    return T(-1) + 2 * fract(x + 0.5);
+}
+template <typename T>
+KFR_SINTRIN T isawtooth(T x)
+{
+    return isawtoothnorm(c_recip_pi<T, 1, 2> * x);
+}
 
-    template <typename T>
-    KFR_SINTRIN T rawtriangle(T x)
-    {
-        return 1 - abs(4 * x - 2);
-    }
-    template <typename T>
-    KFR_SINTRIN T trianglenorm(T x)
-    {
-        return rawtriangle(fract(x + 0.25));
-    }
-    template <typename T>
-    KFR_SINTRIN T triangle(T x)
-    {
-        return trianglenorm(c_recip_pi<T, 1, 2> * x);
-    }
+template <typename T>
+KFR_SINTRIN T rawtriangle(T x)
+{
+    return 1 - abs(4 * x - 2);
+}
+template <typename T>
+KFR_SINTRIN T trianglenorm(T x)
+{
+    return rawtriangle(fract(x + 0.25));
+}
+template <typename T>
+KFR_SINTRIN T triangle(T x)
+{
+    return trianglenorm(c_recip_pi<T, 1, 2> * x);
+}
 
-    KFR_SPEC_FN(in_oscillators, rawsine)
-    KFR_SPEC_FN(in_oscillators, sine)
-    KFR_SPEC_FN(in_oscillators, sinenorm)
-    KFR_SPEC_FN(in_oscillators, rawsquare)
-    KFR_SPEC_FN(in_oscillators, square)
-    KFR_SPEC_FN(in_oscillators, squarenorm)
-    KFR_SPEC_FN(in_oscillators, rawtriangle)
-    KFR_SPEC_FN(in_oscillators, triangle)
-    KFR_SPEC_FN(in_oscillators, trianglenorm)
-    KFR_SPEC_FN(in_oscillators, rawsawtooth)
-    KFR_SPEC_FN(in_oscillators, sawtooth)
-    KFR_SPEC_FN(in_oscillators, sawtoothnorm)
-    KFR_SPEC_FN(in_oscillators, isawtooth)
-    KFR_SPEC_FN(in_oscillators, isawtoothnorm)
-};
+KFR_FN(rawsine)
+KFR_FN(sine)
+KFR_FN(sinenorm)
+KFR_FN(rawsquare)
+KFR_FN(square)
+KFR_FN(squarenorm)
+KFR_FN(rawtriangle)
+KFR_FN(triangle)
+KFR_FN(trianglenorm)
+KFR_FN(rawsawtooth)
+KFR_FN(sawtooth)
+KFR_FN(sawtoothnorm)
+KFR_FN(isawtooth)
+KFR_FN(isawtoothnorm)
 }
 
-using fn_rawsine = internal::in_oscillators<>::fn_rawsine;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> rawsine(const T1& x)
+KFR_INTRIN T1 rawsine(const T1& x)
 {
-    return internal::in_oscillators<>::rawsine(x);
+    return internal::rawsine(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_rawsine, E1> rawsine(E1&& x)
+KFR_INTRIN expr_func<internal::fn_rawsine, E1> rawsine(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_sine = internal::in_oscillators<>::fn_sine;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sine(const T1& x)
+KFR_INTRIN T1 sine(const T1& x)
 {
-    return internal::in_oscillators<>::sine(x);
+    return internal::sine(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sine, E1> sine(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sine, E1> sine(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_sinenorm = internal::in_oscillators<>::fn_sinenorm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sinenorm(const T1& x)
+KFR_INTRIN T1 sinenorm(const T1& x)
 {
-    return internal::in_oscillators<>::sinenorm(x);
+    return internal::sinenorm(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sinenorm, E1> sinenorm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sinenorm, E1> sinenorm(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_rawsquare = internal::in_oscillators<>::fn_rawsquare;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> rawsquare(const T1& x)
+KFR_INTRIN T1 rawsquare(const T1& x)
 {
-    return internal::in_oscillators<>::rawsquare(x);
+    return internal::rawsquare(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_rawsquare, E1> rawsquare(E1&& x)
+KFR_INTRIN expr_func<internal::fn_rawsquare, E1> rawsquare(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_square = internal::in_oscillators<>::fn_square;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> square(const T1& x)
+KFR_INTRIN T1 square(const T1& x)
 {
-    return internal::in_oscillators<>::square(x);
+    return internal::square(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_square, E1> square(E1&& x)
+KFR_INTRIN expr_func<internal::fn_square, E1> square(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_squarenorm = internal::in_oscillators<>::fn_squarenorm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> squarenorm(const T1& x)
+KFR_INTRIN T1 squarenorm(const T1& x)
 {
-    return internal::in_oscillators<>::squarenorm(x);
+    return internal::squarenorm(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_squarenorm, E1> squarenorm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_squarenorm, E1> squarenorm(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_rawtriangle = internal::in_oscillators<>::fn_rawtriangle;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> rawtriangle(const T1& x)
+KFR_INTRIN T1 rawtriangle(const T1& x)
 {
-    return internal::in_oscillators<>::rawtriangle(x);
+    return internal::rawtriangle(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_rawtriangle, E1> rawtriangle(E1&& x)
+KFR_INTRIN expr_func<internal::fn_rawtriangle, E1> rawtriangle(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_triangle = internal::in_oscillators<>::fn_triangle;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> triangle(const T1& x)
+KFR_INTRIN T1 triangle(const T1& x)
 {
-    return internal::in_oscillators<>::triangle(x);
+    return internal::triangle(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_triangle, E1> triangle(E1&& x)
+KFR_INTRIN expr_func<internal::fn_triangle, E1> triangle(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_trianglenorm = internal::in_oscillators<>::fn_trianglenorm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> trianglenorm(const T1& x)
+KFR_INTRIN T1 trianglenorm(const T1& x)
 {
-    return internal::in_oscillators<>::trianglenorm(x);
+    return internal::trianglenorm(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_trianglenorm, E1> trianglenorm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_trianglenorm, E1> trianglenorm(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_rawsawtooth = internal::in_oscillators<>::fn_rawsawtooth;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> rawsawtooth(const T1& x)
+KFR_INTRIN T1 rawsawtooth(const T1& x)
 {
-    return internal::in_oscillators<>::rawsawtooth(x);
+    return internal::rawsawtooth(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_rawsawtooth, E1> rawsawtooth(E1&& x)
+KFR_INTRIN expr_func<internal::fn_rawsawtooth, E1> rawsawtooth(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_sawtooth = internal::in_oscillators<>::fn_sawtooth;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sawtooth(const T1& x)
+KFR_INTRIN T1 sawtooth(const T1& x)
 {
-    return internal::in_oscillators<>::sawtooth(x);
+    return internal::sawtooth(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sawtooth, E1> sawtooth(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sawtooth, E1> sawtooth(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_sawtoothnorm = internal::in_oscillators<>::fn_sawtoothnorm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> sawtoothnorm(const T1& x)
+KFR_INTRIN T1 sawtoothnorm(const T1& x)
 {
-    return internal::in_oscillators<>::sawtoothnorm(x);
+    return internal::sawtoothnorm(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_sawtoothnorm, E1> sawtoothnorm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_sawtoothnorm, E1> sawtoothnorm(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_isawtooth = internal::in_oscillators<>::fn_isawtooth;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> isawtooth(const T1& x)
+KFR_INTRIN T1 isawtooth(const T1& x)
 {
-    return internal::in_oscillators<>::isawtooth(x);
+    return internal::isawtooth(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_isawtooth, E1> isawtooth(E1&& x)
+KFR_INTRIN expr_func<internal::fn_isawtooth, E1> isawtooth(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_isawtoothnorm = internal::in_oscillators<>::fn_isawtoothnorm;
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> isawtoothnorm(const T1& x)
+KFR_INTRIN T1 isawtoothnorm(const T1& x)
 {
-    return internal::in_oscillators<>::isawtoothnorm(x);
+    return internal::isawtoothnorm(x);
 }
-
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_isawtoothnorm, E1> isawtoothnorm(E1&& x)
+KFR_INTRIN expr_func<internal::fn_isawtoothnorm, E1> isawtoothnorm(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/dsp/resample.hpp b/include/kfr/dsp/resample.hpp
@@ -28,11 +28,6 @@
 #include "../expressions/reduce.hpp"
 #include "window.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 namespace resample_quality
@@ -45,200 +40,174 @@ constexpr csize_t<10> high{};
 
 namespace internal
 {
-template <cpu_t cc = cpu_t::native>
-struct in_resampling : in_sqrt<cc>, in_abs<cc>, in_log_exp<cc>, in_sin_cos<cc>, in_window<cc>, in_reduce<cc>
+template <typename T1, typename T2>
+KFR_SINTRIN T1 resample_blackman(T1 n, T2 a)
 {
-private:
-    using in_sqrt<cc>::sqrt;
-    using in_abs<cc>::abs;
-    using in_log_exp<cc>::log;
-    using in_log_exp<cc>::exp;
-    using in_log_exp<cc>::log_fmadd;
-    using in_log_exp<cc>::exp_fmadd;
-    using in_log_exp<cc>::exp10;
-    using in_sin_cos<cc>::cos;
-    using in_sin_cos<cc>::sinc;
-    using in_reduce<cc>::dotproduct;
-    using in_reduce<cc>::sum;
-
-public:
-    template <typename T1, typename T2>
-    static inline T1 blackman(T1 n, T2 a)
-    {
-        const T1 a0 = (1 - a) * 0.5;
-        const T1 a1 = 0.5;
-        const T1 a2 = a * 0.5;
-        n           = n * c_pi<T1, 2>;
-        return a0 - a1 * cos(n) + a2 * cos(2 * n);
-    }
+    const T1 a0 = (1 - a) * 0.5;
+    const T1 a1 = 0.5;
+    const T1 a2 = a * 0.5;
+    n           = n * c_pi<T1, 2>;
+    return a0 - a1 * cos(n) + a2 * cos(2 * n);
+}
 
-    template <typename T, size_t quality>
-    struct resampler
-    {
-        template <cpu_t newcpu>
-        using retarget_this = typename in_resampling<newcpu>::template resampler<T, quality>;
+template <typename T, size_t quality>
+struct resampler
+{
+    using itype = i64;
 
-        using itype = i64;
+    constexpr static itype depth = static_cast<itype>(1 << (quality + 1));
 
-        constexpr static itype depth = static_cast<itype>(1 << (quality + 1));
+    resampler(itype interpolation_factor, itype decimation_factor, T scale = T(1), T cutoff = 0.49)
+        : input_position(0), output_position(0)
+    {
+        const i64 gcf = gcd(interpolation_factor, decimation_factor);
+        interpolation_factor /= gcf;
+        decimation_factor /= gcf;
 
-        resampler(itype interpolation_factor, itype decimation_factor, T scale = T(1), T cutoff = 0.49)
-            : input_position(0), output_position(0)
-        {
-            const i64 gcf = gcd(interpolation_factor, decimation_factor);
-            interpolation_factor /= gcf;
-            decimation_factor /= gcf;
+        taps  = depth * interpolation_factor;
+        order = size_t(depth * interpolation_factor - 1);
 
-            taps  = depth * interpolation_factor;
-            order = size_t(depth * interpolation_factor - 1);
+        this->interpolation_factor = interpolation_factor;
+        this->decimation_factor    = decimation_factor;
 
-            this->interpolation_factor = interpolation_factor;
-            this->decimation_factor    = decimation_factor;
+        const itype halftaps = taps / 2;
+        filter               = univector<T>(size_t(taps), T());
+        delay                = univector<T>(size_t(depth), T());
 
-            const itype halftaps = taps / 2;
-            filter               = univector<T>(size_t(taps), T());
-            delay                = univector<T>(size_t(depth), T());
+        cutoff = cutoff / std::max(decimation_factor, interpolation_factor);
 
-            cutoff = cutoff / std::max(decimation_factor, interpolation_factor);
+        for (itype j = 0, jj = 0; j < taps; j++)
+        {
+            filter[size_t(j)] = scale * 2 * interpolation_factor * cutoff *
+                                sinc((jj - halftaps) * cutoff * c_pi<T, 2>) *
+                                resample_blackman(T(jj) / T(taps - 1), T(0.16));
+            jj += size_t(interpolation_factor);
+            if (jj >= taps)
+                jj = jj - taps + 1;
+        }
 
-            for (itype j = 0, jj = 0; j < taps; j++)
-            {
-                filter[size_t(j)] = scale * 2 * interpolation_factor * cutoff *
-                                    sinc((jj - halftaps) * cutoff * c_pi<T, 2>) *
-                                    blackman(T(jj) / T(taps - 1), T(0.16));
-                jj += size_t(interpolation_factor);
-                if (jj >= taps)
-                    jj = jj - taps + 1;
-            }
+        const T s = reciprocal(sum(filter)) * interpolation_factor;
+        filter    = filter * s;
+    }
+    KFR_INLINE size_t operator()(T* dest, size_t zerosize)
+    {
+        size_t outputsize   = 0;
+        const itype srcsize = itype(zerosize);
 
-            const T s = reciprocal(sum(filter)) * interpolation_factor;
-            filter    = filter * s;
-        }
-        KFR_INLINE size_t operator()(T* dest, size_t zerosize)
+        for (size_t i = 0;; i++)
         {
-            size_t outputsize   = 0;
-            const itype srcsize = itype(zerosize);
-
-            for (size_t i = 0;; i++)
+            const itype ii                 = itype(i) + output_position;
+            const itype workindex          = ii * (decimation_factor);
+            const itype workindex_rem      = workindex % (interpolation_factor);
+            const itype start              = workindex_rem ? (interpolation_factor)-workindex_rem : 0;
+            itype srcindex                 = workindex / (interpolation_factor);
+            srcindex                       = workindex_rem ? srcindex + 1 : srcindex;
+            const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth));
+            srcindex                       = srcindex - (depth - 1);
+
+            if (srcindex + depth >= input_position + srcsize)
+                break;
+            outputsize++;
+
+            if (dest)
             {
-                const itype ii                 = itype(i) + output_position;
-                const itype workindex          = ii * (decimation_factor);
-                const itype workindex_rem      = workindex % (interpolation_factor);
-                const itype start              = workindex_rem ? (interpolation_factor)-workindex_rem : 0;
-                itype srcindex                 = workindex / (interpolation_factor);
-                srcindex                       = workindex_rem ? srcindex + 1 : srcindex;
-                const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth));
-                srcindex                       = srcindex - (depth - 1);
-
-                if (srcindex + depth >= input_position + srcsize)
-                    break;
-                outputsize++;
-
-                if (dest)
+                if (srcindex >= input_position)
                 {
-                    if (srcindex >= input_position)
-                    {
-                        dest[i] = T(0);
-                    }
-                    else
-                    {
-                        const itype prev_count = input_position - srcindex;
-                        dest[i]                = dotproduct(delay.slice(size_t(depth - prev_count)), tap_ptr);
-                    }
+                    dest[i] = T(0);
+                }
+                else
+                {
+                    const itype prev_count = input_position - srcindex;
+                    dest[i]                = dotproduct(delay.slice(size_t(depth - prev_count)), tap_ptr);
                 }
             }
-            if (srcsize >= depth)
-            {
-                delay = zeros();
-            }
-            else
-            {
-                delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize));
-                delay.slice(size_t(depth - srcsize)) = zeros();
-            }
-
-            input_position += srcsize;
-            output_position += outputsize;
-            return outputsize;
         }
-        KFR_INLINE size_t operator()(T* dest, univector_ref<const T> src)
+        if (srcsize >= depth)
         {
-            size_t outputsize   = 0;
-            const itype srcsize = itype(src.size());
+            delay = zeros();
+        }
+        else
+        {
+            delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize));
+            delay.slice(size_t(depth - srcsize)) = zeros();
+        }
+
+        input_position += srcsize;
+        output_position += outputsize;
+        return outputsize;
+    }
+    KFR_INLINE size_t operator()(T* dest, univector_ref<const T> src)
+    {
+        size_t outputsize   = 0;
+        const itype srcsize = itype(src.size());
 
-            for (size_t i = 0;; i++)
+        for (size_t i = 0;; i++)
+        {
+            const itype ii                 = itype(i) + output_position;
+            const itype workindex          = ii * (decimation_factor);
+            const itype workindex_rem      = workindex % (interpolation_factor);
+            const itype start              = workindex_rem ? (interpolation_factor)-workindex_rem : 0;
+            itype srcindex                 = workindex / (interpolation_factor);
+            srcindex                       = workindex_rem ? srcindex + 1 : srcindex;
+            const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth));
+            srcindex                       = srcindex - (depth - 1);
+
+            if (srcindex + depth >= input_position + srcsize)
+                break;
+            outputsize++;
+
+            if (dest)
             {
-                const itype ii                 = itype(i) + output_position;
-                const itype workindex          = ii * (decimation_factor);
-                const itype workindex_rem      = workindex % (interpolation_factor);
-                const itype start              = workindex_rem ? (interpolation_factor)-workindex_rem : 0;
-                itype srcindex                 = workindex / (interpolation_factor);
-                srcindex                       = workindex_rem ? srcindex + 1 : srcindex;
-                const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth));
-                srcindex                       = srcindex - (depth - 1);
-
-                if (srcindex + depth >= input_position + srcsize)
-                    break;
-                outputsize++;
-
-                if (dest)
+                if (srcindex >= input_position)
                 {
-                    if (srcindex >= input_position)
-                    {
-                        dest[i] = dotproduct(src.slice(size_t(srcindex - input_position), size_t(depth)),
-                                             tap_ptr /*, depth*/);
-                    }
-                    else
-                    {
-                        const itype prev_count = input_position - srcindex;
-                        dest[i] =
-                            dotproduct(delay.slice(size_t(depth - prev_count)),
-                                       tap_ptr /*, size_t(prev_count)*/) +
-                            dotproduct(src, tap_ptr.slice(
-                                                size_t(prev_count),
-                                                size_t(depth - prev_count)) /*, size_t(depth - prev_count)*/);
-                    }
+                    dest[i] = dotproduct(src.slice(size_t(srcindex - input_position), size_t(depth)),
+                                         tap_ptr /*, depth*/);
+                }
+                else
+                {
+                    const itype prev_count = input_position - srcindex;
+                    dest[i] =
+                        dotproduct(delay.slice(size_t(depth - prev_count)),
+                                   tap_ptr /*, size_t(prev_count)*/) +
+                        dotproduct(
+                            src, tap_ptr.slice(size_t(prev_count),
+                                               size_t(depth - prev_count)) /*, size_t(depth - prev_count)*/);
                 }
             }
-            if (srcsize >= depth)
-            {
-                delay = src.slice(size_t(srcsize - depth));
-            }
-            else
-            {
-                delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize));
-                delay.slice(size_t(depth - srcsize)) = src;
-            }
-
-            input_position += srcsize;
-            output_position += outputsize;
-            return outputsize;
         }
-        itype taps;
-        size_t order;
-        itype interpolation_factor;
-        itype decimation_factor;
-        univector<T> filter;
-        univector<T> delay;
-        itype input_position;
-        itype output_position;
-    };
+        if (srcsize >= depth)
+        {
+            delay = src.slice(size_t(srcsize - depth));
+        }
+        else
+        {
+            delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize));
+            delay.slice(size_t(depth - srcsize)) = src;
+        }
+
+        input_position += srcsize;
+        output_position += outputsize;
+        return outputsize;
+    }
+    itype taps;
+    size_t order;
+    itype interpolation_factor;
+    itype decimation_factor;
+    univector<T> filter;
+    univector<T> delay;
+    itype input_position;
+    itype output_position;
 };
 }
 
-namespace native
-{
 template <typename T, size_t quality>
-inline internal::in_resampling<>::resampler<T, quality> resampler(csize_t<quality>,
+inline internal::resampler<T, quality> resampler(csize_t<quality>,
                                                                   size_t interpolation_factor,
                                                                   size_t decimation_factor, T scale = T(1),
                                                                   T cutoff = 0.49)
 {
-    using itype = typename internal::in_resampling<>::resampler<T, quality>::itype;
-    return internal::in_resampling<>::resampler<T, quality>(itype(interpolation_factor),
+    using itype = typename internal::resampler<T, quality>::itype;
+    return internal::resampler<T, quality>(itype(interpolation_factor),
                                                             itype(decimation_factor), scale, cutoff);
 }
 }
-}
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp
@@ -31,11 +31,6 @@
 #include "../base/vec.hpp"
 #include "../expressions/basic.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
@@ -43,177 +38,159 @@ using sample_rate_t = double;
 
 namespace internal
 {
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_dsp_units : in_log_exp<cc>, in_select<cc>, in_round<cc>, in_abs<cc>
-{
-private:
-    using in_log_exp<cc>::log;
-    using in_log_exp<cc>::exp;
-    using in_log_exp<cc>::log10;
-    using in_log_exp<cc>::exp10;
-    using in_log_exp<cc>::exp_fmadd;
-    using in_log_exp<cc>::log_fmadd;
-    using in_select<cc>::select;
-    using in_round<cc>::fract;
-    using in_abs<cc>::abs;
-
-public:
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF amp_to_dB(T amp)
-    {
-        return log(cast<subtype<TF>>(amp)) * subtype<TF>(8.6858896380650365530225783783322);
-        // return T( 20.0 ) * log10( level );
-    }
-
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF dB_to_amp(T dB)
-    {
-        return exp(dB * subtype<TF>(0.11512925464970228420089957273422));
-        // return exp10( dB / 20 );
-    }
-
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF amp_to_dB(T amp, T offset)
-    {
-        return log_fmadd(amp, subtype<TF>(8.6858896380650365530225783783322), offset);
-        // return T( 20.0 ) * log10( level );
-    }
-
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF dB_to_amp(T dB, T offset)
-    {
-        auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset;
-        return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs);
-        // return exp10( dB / 20 );
-    }
-
-    template <typename T>
-    KFR_SINTRIN T power_to_dB(T x)
-    {
-        return log(x) * (10 * c_recip_log_10<T>);
-    }
-
-    template <typename T>
-    KFR_SINTRIN T dB_to_power(T x)
-    {
-        if (x == -c_infinity<T>)
-            return 0.0;
-        else
-            return exp(x * (c_log_10<T> / 10.0));
-    }
-
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF note_to_hertz(T note)
-    {
-        const subtype<TF> offset = 2.1011784386926213177653145771814;
-
-        return exp_fmadd(note, subtype<TF>(0.05776226504666210911810267678818), offset);
-    }
-
-    template <typename T, typename TF = ftype<T>>
-    KFR_SINTRIN TF hertz_to_note(T hertz)
-    {
-        const subtype<TF> offset = -36.376316562295915248836189714583;
-
-        return log_fmadd(hertz, subtype<TF>(17.312340490667560888319096172023), offset);
-    }
-
-    template <typename T1, typename T2, typename T3, typename Tc = common_type<T1, T2, T3, f32>>
-    KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz)
-    {
-        const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818);
-
-        return exp_fmadd(note, subtype<Tc>(0.05776226504666210911810267678818), offset);
-    }
-
-    template <typename T1, typename T2, typename T3, typename Tc = common_type<T1, T2, T3, f32>>
-    KFR_SINTRIN Tc hertz_to_note(T1 hertz, T2 tunenote, T3 tunehertz)
-    {
-        const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023);
-
-        return log_fmadd(hertz, subtype<Tc>(17.312340490667560888319096172023), offset);
-    }
-
-    KFR_SPEC_FN(in_dsp_units, note_to_hertz)
-    KFR_SPEC_FN(in_dsp_units, hertz_to_note)
-    KFR_SPEC_FN(in_dsp_units, amp_to_dB)
-    KFR_SPEC_FN(in_dsp_units, dB_to_amp)
-    KFR_SPEC_FN(in_dsp_units, power_to_dB)
-    KFR_SPEC_FN(in_dsp_units, dB_to_power)
-};
-}
-
-using fn_note_to_hertz = internal::in_dsp_units<>::fn_note_to_hertz;
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF amp_to_dB(T amp)
+{
+    return log(cast<subtype<TF>>(amp)) * subtype<TF>(8.6858896380650365530225783783322);
+    // return T( 20.0 ) * log10( level );
+}
+
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF dB_to_amp(T dB)
+{
+    return exp(dB * subtype<TF>(0.11512925464970228420089957273422));
+    // return exp10( dB / 20 );
+}
+
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF amp_to_dB2(T amp, T offset)
+{
+    return log_fmadd(amp, subtype<TF>(8.6858896380650365530225783783322), offset);
+    // return T( 20.0 ) * log10( level );
+}
+
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF dB_to_amp(T dB, T offset)
+{
+    auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset;
+    return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs);
+    // return exp10( dB / 20 );
+}
+
+template <typename T>
+KFR_SINTRIN T power_to_dB(T x)
+{
+    return log(x) * (10 * c_recip_log_10<T>);
+}
+
+template <typename T>
+KFR_SINTRIN T dB_to_power(T x)
+{
+    if (x == -c_infinity<T>)
+        return 0.0;
+    else
+        return exp(x * (c_log_10<T> / 10.0));
+}
+
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF note_to_hertz(T note)
+{
+    const subtype<TF> offset = 2.1011784386926213177653145771814;
+
+    return exp_fmadd(note, subtype<TF>(0.05776226504666210911810267678818), offset);
+}
+
+template <typename T, typename TF = ftype<T>>
+KFR_SINTRIN TF hertz_to_note(T hertz)
+{
+    const subtype<TF> offset = -36.376316562295915248836189714583;
+
+    return log_fmadd(hertz, subtype<TF>(17.312340490667560888319096172023), offset);
+}
+
+template <typename T1, typename T2, typename T3, typename Tc = common_type<T1, T2, T3, f32>>
+KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz)
+{
+    const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818);
+
+    return exp_fmadd(note, subtype<Tc>(0.05776226504666210911810267678818), offset);
+}
+
+template <typename T1, typename T2, typename T3, typename Tc = common_type<T1, T2, T3, f32>>
+KFR_SINTRIN Tc hertz_to_note(T1 hertz, T2 tunenote, T3 tunehertz)
+{
+    const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023);
+
+    return log_fmadd(hertz, subtype<Tc>(17.312340490667560888319096172023), offset);
+}
+
+KFR_I_FN(note_to_hertz)
+KFR_I_FN(hertz_to_note)
+KFR_I_FN(amp_to_dB)
+KFR_I_FN(dB_to_amp)
+KFR_I_FN(power_to_dB)
+KFR_I_FN(dB_to_power)
+}
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> note_to_hertz(const T1& x)
+KFR_INTRIN T1 note_to_hertz(const T1& x)
 {
-    return internal::in_dsp_units<>::note_to_hertz(x);
+    return internal::note_to_hertz(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_note_to_hertz, E1> note_to_hertz(E1&& x)
+KFR_INTRIN expr_func<internal::fn_note_to_hertz, E1> note_to_hertz(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_hertz_to_note = internal::in_dsp_units<>::fn_hertz_to_note;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> hertz_to_note(const T1& x)
+KFR_INTRIN T1 hertz_to_note(const T1& x)
 {
-    return internal::in_dsp_units<>::hertz_to_note(x);
+    return internal::hertz_to_note(x);
 }
+
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_hertz_to_note, E1> hertz_to_note(E1&& x)
+KFR_INTRIN expr_func<internal::fn_hertz_to_note, E1> hertz_to_note(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_amp_to_dB = internal::in_dsp_units<>::fn_amp_to_dB;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> amp_to_dB(const T1& x)
+KFR_INTRIN T1 amp_to_dB(const T1& x)
 {
-    return internal::in_dsp_units<>::amp_to_dB(x);
+    return internal::amp_to_dB(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_amp_to_dB, E1> amp_to_dB(E1&& x)
+KFR_INTRIN expr_func<internal::fn_amp_to_dB, E1> amp_to_dB(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_dB_to_amp = internal::in_dsp_units<>::fn_dB_to_amp;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> dB_to_amp(const T1& x)
+KFR_INTRIN T1 dB_to_amp(const T1& x)
 {
-    return internal::in_dsp_units<>::dB_to_amp(x);
+    return internal::dB_to_amp(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_dB_to_amp, E1> dB_to_amp(E1&& x)
+KFR_INTRIN expr_func<internal::fn_dB_to_amp, E1> dB_to_amp(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_power_to_dB = internal::in_dsp_units<>::fn_power_to_dB;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> power_to_dB(const T1& x)
+KFR_INTRIN T1 power_to_dB(const T1& x)
 {
-    return internal::in_dsp_units<>::power_to_dB(x);
+    return internal::power_to_dB(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_power_to_dB, E1> power_to_dB(E1&& x)
+KFR_INTRIN expr_func<internal::fn_power_to_dB, E1> power_to_dB(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
-using fn_dB_to_power = internal::in_dsp_units<>::fn_dB_to_power;
+
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN ftype<T1> dB_to_power(const T1& x)
+KFR_INTRIN T1 dB_to_power(const T1& x)
 {
-    return internal::in_dsp_units<>::dB_to_power(x);
+    return internal::dB_to_power(x);
 }
 
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN expr_func<fn_dB_to_power, E1> dB_to_power(E1&& x)
+KFR_INTRIN expr_func<internal::fn_dB_to_power, E1> dB_to_power(E1&& x)
 {
     return { {}, std::forward<E1>(x) };
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/dsp/weighting.hpp b/include/kfr/dsp/weighting.hpp
@@ -31,92 +31,83 @@ namespace kfr
 namespace internal
 {
 
-template <cpu_t c = cpu_t::native, cpu_t cc = c>
-struct in_weight : in_sqrt<cc>, in_dsp_units<cc>
+template <typename T>
+KFR_SINTRIN T weight_a_unnorm(T f)
 {
-private:
-    using in_dsp_units<cc>::amp_to_dB;
-
-public:
-    template <typename T>
-    KFR_SINTRIN T weight_a_unnorm(T f)
-    {
-        const T f2  = pow2(f);
-        const T nom = pow2(12200) * pow4(f);
-        const T den =
-            (f2 + pow2(20.6)) * (sqrt((f2 + pow2(107.7)) * (f2 + pow2(737.9)))) * (f2 + pow2(12200));
-        return nom / den;
-    }
+    const T f2  = pow2(f);
+    const T nom = pow2(12200) * pow4(f);
+    const T den = (f2 + pow2(20.6)) * (sqrt((f2 + pow2(107.7)) * (f2 + pow2(737.9)))) * (f2 + pow2(12200));
+    return nom / den;
+}
 
-    template <typename T>
-    constexpr static T weight_a_gain = reciprocal(weight_a_unnorm(T(1000.0)));
+template <typename T>
+constexpr static T weight_a_gain = reciprocal(weight_a_unnorm(T(1000.0)));
 
-    template <typename T>
-    KFR_SINTRIN T aweighting(T f)
-    {
-        return weight_a_unnorm(f) * weight_a_gain<subtype<T>>;
-    }
+template <typename T>
+KFR_SINTRIN T aweighting(T f)
+{
+    return weight_a_unnorm(f) * weight_a_gain<subtype<T>>;
+}
 
-    template <typename T>
-    KFR_SINTRIN T weight_b_unnorm(T f)
-    {
-        const T f2  = pow2(f);
-        const T nom = pow2(12200) * pow3(f);
-        const T den = (f2 + pow2(20.6)) * (sqrt((f2 + pow2(158.5)))) * (f2 + pow2(12200));
+template <typename T>
+KFR_SINTRIN T weight_b_unnorm(T f)
+{
+    const T f2  = pow2(f);
+    const T nom = pow2(12200) * pow3(f);
+    const T den = (f2 + pow2(20.6)) * (sqrt((f2 + pow2(158.5)))) * (f2 + pow2(12200));
 
-        return nom / den;
-    }
+    return nom / den;
+}
 
-    template <typename T>
-    constexpr static T weight_b_gain = reciprocal(weight_b_unnorm(T(1000.0)));
+template <typename T>
+constexpr static T weight_b_gain = reciprocal(weight_b_unnorm(T(1000.0)));
 
-    template <typename T>
-    KFR_SINTRIN T bweighting(T f)
-    {
-        return weight_b_unnorm(f) * weight_b_gain<subtype<T>>;
-    }
+template <typename T>
+KFR_SINTRIN T bweighting(T f)
+{
+    return weight_b_unnorm(f) * weight_b_gain<subtype<T>>;
+}
 
-    template <typename T>
-    KFR_SINTRIN T weight_c_unnorm(T f)
-    {
-        const T f2  = pow2(f);
-        const T nom = pow2(12200) * f2;
-        const T den = (f2 + pow2(20.6)) * (f2 + pow2(12200));
+template <typename T>
+KFR_SINTRIN T weight_c_unnorm(T f)
+{
+    const T f2  = pow2(f);
+    const T nom = pow2(12200) * f2;
+    const T den = (f2 + pow2(20.6)) * (f2 + pow2(12200));
 
-        return nom / den;
-    }
+    return nom / den;
+}
 
-    template <typename T>
-    constexpr static T weight_c_gain = reciprocal(weight_c_unnorm(T(1000.0)));
+template <typename T>
+constexpr static T weight_c_gain = reciprocal(weight_c_unnorm(T(1000.0)));
 
-    template <typename T>
-    KFR_SINTRIN T cweighting(T f)
-    {
-        return weight_c_unnorm(f) * weight_c_gain<subtype<T>>;
-    }
+template <typename T>
+KFR_SINTRIN T cweighting(T f)
+{
+    return weight_c_unnorm(f) * weight_c_gain<subtype<T>>;
+}
 
-    template <typename T>
-    KFR_SINTRIN T aweightingdB(T f)
-    {
-        return amp_to_dB(aweighting(f));
-    }
-    template <typename T>
-    KFR_SINTRIN T bweightingdB(T f)
-    {
-        return amp_to_dB(bweighting(f));
-    }
-    template <typename T>
-    KFR_SINTRIN T cweightingdB(T f)
-    {
-        return amp_to_dB(cweighting(f));
-    }
+template <typename T>
+KFR_SINTRIN T aweightingdB(T f)
+{
+    return amp_to_dB(aweighting(f));
+}
+template <typename T>
+KFR_SINTRIN T bweightingdB(T f)
+{
+    return amp_to_dB(bweighting(f));
+}
+template <typename T>
+KFR_SINTRIN T cweightingdB(T f)
+{
+    return amp_to_dB(cweighting(f));
+}
 
-    KFR_SPEC_FN(in_weight, aweighting)
-    KFR_SPEC_FN(in_weight, bweighting)
-    KFR_SPEC_FN(in_weight, cweighting)
-    KFR_SPEC_FN(in_weight, aweightingdB)
-    KFR_SPEC_FN(in_weight, bweightingdB)
-    KFR_SPEC_FN(in_weight, cweightingdB)
-};
+KFR_FN(aweighting)
+KFR_FN(bweighting)
+KFR_FN(cweighting)
+KFR_FN(aweightingdB)
+KFR_FN(bweightingdB)
+KFR_FN(cweightingdB)
 }
 }
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -23,16 +23,12 @@
 #pragma once
 
 #include "../base/log_exp.hpp"
+#include "../base/modzerobessel.hpp"
 #include "../base/sin_cos.hpp"
 #include "../base/sqrt.hpp"
 #include "../base/vec.hpp"
 #include "../expressions/pointer.hpp"
 
-#pragma clang diagnostic push
-#if CID_HAS_WARNING("-Winaccessible-base")
-#pragma clang diagnostic ignored "-Winaccessible-base"
-#endif
-
 namespace kfr
 {
 
@@ -70,482 +66,406 @@ namespace internal
 {
 
 template <typename T>
-constexpr T bessel_coef[] = { T(0.25),
-                              T(0.027777777777777776236),
-                              T(0.0017361111111111110147),
-                              T(6.9444444444444444384e-005),
-                              T(1.9290123456790123911e-006),
-                              T(3.9367598891408417495e-008),
-                              T(6.1511873267825652335e-010),
-                              T(7.5940584281266239246e-012),
-                              T(7.5940584281266233693e-014),
-                              T(6.2760813455591932909e-016),
-                              T(4.3583898233049949985e-018),
-                              T(2.5789288895295827557e-020),
-                              T(1.3157800456783586208e-022),
-                              T(5.8479113141260384983e-025),
-                              T(2.2843403570804837884e-027),
-                              T(7.904291893012054025e-030),
-                              T(2.4395962632753252792e-032),
-                              T(6.75788438580422547e-035),
-                              T(1.689471096451056426e-037),
-                              T(3.8310002187098784929e-040),
-                              T(7.9152897080782616517e-043),
-                              T(1.4962740468957016443e-045),
-                              T(2.5976979980828152196e-048),
-                              T(4.1563167969325041577e-051),
-                              T(6.1483976285983795968e-054),
-                              T(8.434015951438105991e-057),
-                              T(1.0757673407446563809e-059),
-                              T(1.2791526049282476926e-062),
-                              T(1.4212806721424974034e-065),
-                              T(1.4789601166935457918e-068),
-                              T(1.4442969889585408123e-071),
-                              T(1.3262598613026086927e-074),
-                              T(1.1472836170437790782e-077),
-                              T(9.3655805472961564331e-081),
-                              T(7.2265282000741942594e-084),
-                              T(5.2786911614858977913e-087),
-                              T(3.6556032974279072401e-090),
-                              T(2.4034209713529963119e-093),
-                              T(1.5021381070956226783e-096) };
-
-template <typename T, size_t N>
-KFR_INLINE vec<T, N> modzerobessel(vec<T, N> x)
-{
-    const vec<T, N> x_2     = x * 0.5;
-    const vec<T, N> x_2_sqr = x_2 * x_2;
-    vec<T, N> num           = x_2_sqr;
-    vec<T, N> result;
-    result = 1 + x_2_sqr;
-
-    KFR_LOOP_UNROLL
-    for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++)
-    {
-        result = fmadd((num *= x_2_sqr), bessel_coef<T>[i], result);
-    }
-    return result;
-}
+struct window_linspace_0_1 : expression_linspace<T>
+{
+    window_linspace_0_1(size_t size, window_symmetry symmetry)
+        : expression_linspace<T>(0, 1, size, symmetry == window_symmetry::symmetric)
+    {
+    }
+};
+
+template <typename T>
+struct window_linspace_m1_1 : expression_linspace<T>
+{
+    window_linspace_m1_1(size_t size, window_symmetry symmetry)
+        : expression_linspace<T>(-1, 1, size, symmetry == window_symmetry::symmetric)
+    {
+    }
+};
+
+template <typename T>
+struct window_linspace_mpi_pi : expression_linspace<T>
+{
+    window_linspace_mpi_pi(size_t size, window_symmetry symmetry)
+        : expression_linspace<T>(-c_pi<T>, +c_pi<T>, size, symmetry == window_symmetry::symmetric)
+    {
+    }
+};
+
+template <typename T>
+struct window_linspace_m1_1_trunc : expression_linspace<T>
+{
+    window_linspace_m1_1_trunc(size_t size, window_symmetry symmetry)
+        : expression_linspace<T>(-T(size - 1) / size, T(size - 1) / size, size,
+                                 symmetry == window_symmetry::symmetric)
+    {
+    }
+};
+
+template <typename T>
+struct window_linspace_m1_1_trunc2 : expression_linspace<T>
+{
+    window_linspace_m1_1_trunc2(size_t size, window_symmetry symmetry)
+        : expression_linspace<T>(symmetric_linspace,
+                                 (size & 1) ? T(size - 1) / T(size + 1) : T(size - 1) / (size), size,
+                                 symmetry == window_symmetry::symmetric)
+    {
+    }
+};
 
-template <cpu_t cpu = cpu_t::native>
-struct in_window : in_sin_cos<cpu>, in_log_exp<cpu>, in_select<cpu>, in_sqrt<cpu>, in_abs<cpu>
+template <typename T>
+struct expression_rectangular : input_expression
 {
+    using value_type = T;
+
+    template <cpu_t newcpu>
+    using retarget_this = expression_rectangular<T>;
+    expression_rectangular(size_t size, T = T(), window_symmetry = window_symmetry::symmetric) : m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        using UI = utype<U>;
+        const vec<UI, N> i = enumerate(vec<UI, N>()) + cast<UI>(index);
+        return select(i < cast<UI>(m_size), U(1), U(0));
+    }
+    size_t size() const { return m_size; }
+
 private:
-    using in_sin_cos<cpu>::sin;
-    using in_sin_cos<cpu>::cos;
-    using in_sin_cos<cpu>::sinc;
-    using in_log_exp<cpu>::exp;
-    using in_select<cpu>::select;
-    using in_sqrt<cpu>::sqrt;
-    using in_abs<cpu>::abs;
-
-public:
-    template <typename T>
-    struct window_linspace_0_1 : expression_linspace<T>
-    {
-        window_linspace_0_1(size_t size, window_symmetry symmetry)
-            : expression_linspace<T>(0, 1, size, symmetry == window_symmetry::symmetric)
-        {
-        }
-    };
+    size_t m_size;
+};
 
-    template <typename T>
-    struct window_linspace_m1_1 : expression_linspace<T>
+template <typename T>
+struct expression_triangular : input_expression
+{
+    using value_type = T;
+
+    template <cpu_t newcpu>
+    using retarget_this = expression_triangular<T>;
+    expression_triangular(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
     {
-        window_linspace_m1_1(size_t size, window_symmetry symmetry)
-            : expression_linspace<T>(-1, 1, size, symmetry == window_symmetry::symmetric)
-        {
-        }
-    };
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(1 - abs(linspace(cinput, index, y)));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct window_linspace_mpi_pi : expression_linspace<T>
+private:
+    window_linspace_m1_1_trunc2<T> linspace;
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_bartlett : input_expression
+{
+    using value_type = T;
+
+    template <cpu_t newcpu>
+    using retarget_this = expression_bartlett<T>;
+    expression_bartlett(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
     {
-        window_linspace_mpi_pi(size_t size, window_symmetry symmetry)
-            : expression_linspace<T>(-c_pi<T>, +c_pi<T>, size, symmetry == window_symmetry::symmetric)
-        {
-        }
-    };
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(1 - abs(linspace(cinput, index, y)));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct window_linspace_m1_1_trunc : expression_linspace<T>
+private:
+    window_linspace_m1_1<T> linspace;
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_cosine : input_expression
+{
+    using value_type = T;
+
+    template <cpu_t newcpu>
+    using retarget_this = expression_cosine<T>;
+    expression_cosine(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
     {
-        window_linspace_m1_1_trunc(size_t size, window_symmetry symmetry)
-            : expression_linspace<T>(-T(size - 1) / size, T(size - 1) / size, size,
-                                     symmetry == window_symmetry::symmetric)
-        {
-        }
-    };
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(sin(c_pi<T> * linspace(cinput, index, y)));
+    }
+    size_t size() const { return m_size; }
+
+private:
+    window_linspace_0_1<T> linspace;
+    size_t m_size;
+};
 
-    template <typename T>
-    struct window_linspace_m1_1_trunc2 : expression_linspace<T>
+template <typename T>
+struct expression_hann : input_expression
+{
+    using value_type = T;
+
+    template <cpu_t newcpu>
+    using retarget_this = expression_hann<T>;
+    expression_hann(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
     {
-        window_linspace_m1_1_trunc2(size_t size, window_symmetry symmetry)
-            : expression_linspace<T>(symmetric_linspace,
-                                     (size & 1) ? T(size - 1) / T(size + 1) : T(size - 1) / (size), size,
-                                     symmetry == window_symmetry::symmetric)
-        {
-        }
-    };
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct expression_rectangular : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_rectangular<T>;
-        expression_rectangular(size_t size, T = T(), window_symmetry = window_symmetry::symmetric)
-            : m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            using UI = utype<U>;
-            const vec<UI, N> i = enumerate(vec<UI, N>()) + cast<UI>(index);
-            return select(i < cast<UI>(m_size), U(1), U(0));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        size_t m_size;
-    };
+private:
+    window_linspace_0_1<T> linspace;
+    size_t m_size;
+};
 
-    template <typename T>
-    struct expression_triangular : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_triangular<T>;
-        expression_triangular(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(1 - abs(linspace(cinput, index, y)));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_m1_1_trunc2<T> linspace;
-        size_t m_size;
-    };
+template <typename T>
+struct expression_bartlett_hann : input_expression
+{
+    using value_type = T;
 
-    template <typename T>
-    struct expression_bartlett : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_bartlett<T>;
-        expression_bartlett(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(1 - abs(linspace(cinput, index, y)));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_m1_1<T> linspace;
-        size_t m_size;
-    };
+    template <cpu_t newcpu>
+    using retarget_this = expression_bartlett_hann<T>;
 
-    template <typename T>
-    struct expression_cosine : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_cosine<T>;
-        expression_cosine(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(sin(c_pi<T> * linspace(cinput, index, y)));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        size_t m_size;
-    };
+    expression_bartlett_hann(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        const vec<T, N> xx = linspace(cinput, index, y);
+        return cast<U>(T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct expression_hann : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_hann<T>;
-        expression_hann(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        size_t m_size;
-    };
+private:
+    window_linspace_0_1<T> linspace;
+    size_t m_size;
+};
 
-    template <typename T>
-    struct expression_bartlett_hann : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_bartlett_hann<T>;
-
-        expression_bartlett_hann(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            const vec<T, N> xx = linspace(cinput, index, y);
-            return cast<U>(T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        size_t m_size;
-    };
+template <typename T>
+struct expression_hamming : input_expression
+{
+    using value_type = T;
 
-    template <typename T>
-    struct expression_hamming : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_hamming<T>;
-        expression_hamming(size_t size, T alpha = 0.54, window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), alpha(alpha), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        T alpha;
-        size_t m_size;
-    };
+    template <cpu_t newcpu>
+    using retarget_this = expression_hamming<T>;
+    expression_hamming(size_t size, T alpha = 0.54, window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), alpha(alpha), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct expression_bohman : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_bohman<T>;
-        expression_bohman(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            const vec<U, N> n = abs(linspace(cinput, index, y));
-            return cast<U>((T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_m1_1<T> linspace;
-        size_t m_size;
-    };
+private:
+    window_linspace_0_1<T> linspace;
+    T alpha;
+    size_t m_size;
+};
 
-    template <typename T>
-    struct expression_blackman : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_blackman<T>;
-        expression_blackman(size_t size, T alpha = 0.16,
-                            window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), a0((1 - alpha) * 0.5), a1(0.5), a2(alpha * 0.5), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            const vec<T, N> n = linspace(cinput, index, y);
-            return cast<U>(a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        T a0, a1, a2;
-        size_t m_size;
-    };
+template <typename T>
+struct expression_bohman : input_expression
+{
+    using value_type = T;
 
-    template <typename T>
-    struct expression_blackman_harris : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_blackman_harris<T>;
-        expression_blackman_harris(size_t size, T = T(),
-                                   window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>;
-
-            return cast<U>(T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) -
-                           T(0.01168) * cos(3 * n));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        size_t m_size;
-    };
+    template <cpu_t newcpu>
+    using retarget_this = expression_bohman<T>;
+    expression_bohman(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        const vec<U, N> n = abs(linspace(cinput, index, y));
+        return cast<U>((T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n));
+    }
+    size_t size() const { return m_size; }
 
-    template <typename T>
-    struct expression_kaiser : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_kaiser<T>;
-        expression_kaiser(size_t size, T beta = 0.5, window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), beta(beta), m(reciprocal(modzerobessel(make_vector(beta))[0])),
-              m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m);
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_m1_1<T> linspace;
-        T beta;
-        T m;
-        size_t m_size;
-    };
+private:
+    window_linspace_m1_1<T> linspace;
+    size_t m_size;
+};
 
-    template <typename T>
-    struct expression_flattop : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_flattop<T>;
-        expression_flattop(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>;
-            constexpr T a0 = 1;
-            constexpr T a1 = 1.93;
-            constexpr T a2 = 1.29;
-            constexpr T a3 = 0.388;
-            constexpr T a4 = 0.028;
-            return cast<U>(a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_0_1<T> linspace;
-        size_t m_size;
-    };
+template <typename T>
+struct expression_blackman : input_expression // input expression generating a 3-term cosine-sum (Blackman) window
+{
+    using value_type = T;
 
-    template <typename T>
-    struct expression_gaussian : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_gaussian<T>;
-
-        expression_gaussian(size_t size, T alpha = 2.5, window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), alpha(alpha), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(exp(-0.5 * sqr(alpha * linspace(cinput, index, y))));
-        }
-
-        size_t size() const { return m_size; }
-    private:
-        window_linspace_m1_1_trunc<T> linspace;
-        T alpha;
-        size_t m_size;
-    };
+    template <cpu_t newcpu>
+    using retarget_this = expression_blackman<T>; // does not depend on newcpu: the same type is used for every CPU target
+    expression_blackman(size_t size, T alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), a0((1 - alpha) * 0.5), a1(0.5), a2(alpha * 0.5), m_size(size) // a0/a1/a2 are derived from alpha once, here
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        const vec<T, N> n = linspace(cinput, index, y); // position values for this N-wide vector, produced in T
+        return cast<U>(a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n)); // evaluated in T, then cast to the requested type U
+    }
+    size_t size() const { return m_size; } // the size passed to the constructor
 
-    template <typename T>
-    struct expression_lanczos : input_expression
-    {
-        using value_type = T;
-
-        template <cpu_t newcpu>
-        using retarget_this = typename in_window<newcpu>::template expression_lanczos<T>;
-        expression_lanczos(size_t size, T alpha = 2.5, window_symmetry symmetry = window_symmetry::symmetric)
-            : linspace(size, symmetry), alpha(alpha), m_size(size)
-        {
-        }
-        template <typename U, size_t N>
-        KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
-        {
-            constexpr vec_t<T, N> y{};
-            return cast<U>(sinc(linspace(cinput, index, y)));
-        }
-        size_t size() const { return m_size; }
-
-    private:
-        window_linspace_mpi_pi<T> linspace;
-        T alpha;
-        size_t m_size;
-    };
+private:
+    window_linspace_0_1<T> linspace; // position generator built from (size, symmetry); range presumably [0, 1] -- confirm
+    T a0, a1, a2;
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_blackman_harris : input_expression // input expression generating a 4-term cosine-sum (Blackman-Harris) window
+{
+    using value_type = T;
+    // retarget_this ignores newcpu: every CPU target maps to this same type
+    template <cpu_t newcpu>
+    using retarget_this = expression_blackman_harris<T>;
+    expression_blackman_harris(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric) // the unnamed T parameter is ignored
+        : linspace(size, symmetry), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; // scaled position; c_pi<T, 2> is presumably 2*pi -- confirm
+        // a0 - a1*cos(n) + a2*cos(2n) - a3*cos(3n) with every coefficient converted to T, result cast to U
+        return cast<U>(T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n));
+    }
+    size_t size() const { return m_size; } // the size passed to the constructor
+
+private:
+    window_linspace_0_1<T> linspace; // position generator built from (size, symmetry); range presumably [0, 1] -- confirm
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_kaiser : input_expression // input expression generating a Kaiser window with shape parameter beta
+{
+    using value_type = T;
+    // retarget_this ignores newcpu: every CPU target maps to this same type
+    template <cpu_t newcpu>
+    using retarget_this = expression_kaiser<T>;
+    expression_kaiser(size_t size, T beta = 0.5, window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), beta(beta), m(reciprocal(modzerobessel(make_vector(beta))[0])), // m = 1 / modzerobessel(beta), computed once
+          m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m); // modzerobessel(beta * sqrt(1 - x^2)) * m, x taken from linspace
+    }
+    size_t size() const { return m_size; } // the size passed to the constructor
+
+private:
+    window_linspace_m1_1<T> linspace; // position generator built from (size, symmetry); range presumably [-1, 1] -- confirm
+    T beta;
+    T m; // normalization factor set in the constructor
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_flattop : input_expression // input expression generating a 5-term cosine-sum (flat-top) window
+{
+    using value_type = T;
+    // retarget_this ignores newcpu: every CPU target maps to this same type
+    template <cpu_t newcpu>
+    using retarget_this = expression_flattop<T>;
+    expression_flattop(size_t size, T = T(), window_symmetry symmetry = window_symmetry::symmetric) // the unnamed T parameter is ignored
+        : linspace(size, symmetry), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; // scaled position; c_pi<T, 2> is presumably 2*pi -- confirm
+        constexpr T a0 = 1;
+        constexpr T a1 = 1.93;
+        constexpr T a2 = 1.29;
+        constexpr T a3 = 0.388;
+        constexpr T a4 = 0.028;
+        return cast<U>(a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n)); // alternating-sign cosine sum in T, cast to U
+    }
+    size_t size() const { return m_size; } // the size passed to the constructor
+
+private:
+    window_linspace_0_1<T> linspace; // position generator built from (size, symmetry); range presumably [0, 1] -- confirm
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_gaussian : input_expression // input expression generating a Gaussian window with width parameter alpha
+{
+    using value_type = T;
+    // retarget_this ignores newcpu: every CPU target maps to this same type
+    template <cpu_t newcpu>
+    using retarget_this = expression_gaussian<T>;
+
+    expression_gaussian(size_t size, T alpha = 2.5, window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), alpha(alpha), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(exp(-0.5 * sqr(alpha * linspace(cinput, index, y)))); // exp(-0.5 * (alpha * x)^2); NOTE(review): -0.5 is a double literal, check promotion when T is float
+    }
+
+    size_t size() const { return m_size; } // the size passed to the constructor
+private:
+    window_linspace_m1_1_trunc<T> linspace; // position generator built from (size, symmetry); exact range defined elsewhere
+    T alpha;
+    size_t m_size;
+};
+
+template <typename T>
+struct expression_lanczos : input_expression // input expression generating a Lanczos (sinc) window
+{
+    using value_type = T;
+    // retarget_this ignores newcpu: every CPU target maps to this same type
+    template <cpu_t newcpu>
+    using retarget_this = expression_lanczos<T>;
+    expression_lanczos(size_t size, T alpha = 2.5, window_symmetry symmetry = window_symmetry::symmetric)
+        : linspace(size, symmetry), alpha(alpha), m_size(size)
+    {
+    }
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    {
+        constexpr vec_t<T, N> y{};
+        return cast<U>(sinc(linspace(cinput, index, y))); // sinc of the linspace position; alpha does not take part in the result
+    }
+    size_t size() const { return m_size; } // the size passed to the constructor
+
+private:
+    window_linspace_mpi_pi<T> linspace; // position generator built from (size, symmetry); range presumably [-pi, pi] -- confirm
+    T alpha; // stored by the constructor but never read by operator()
+    size_t m_size;
 };
 
 template <window_type>
@@ -556,7 +476,7 @@ struct window_by_type;
     struct window_by_type<window_type::win>                                                                  \
     {                                                                                                        \
         template <typename T>                                                                                \
-        using type = in_window<>::expression_##win<T>;                                                       \
+        using type = expression_##win<T>;                                                                    \
     };
 KFR_WINDOW_BY_TYPE(rectangular)
 KFR_WINDOW_BY_TYPE(triangular)
@@ -574,83 +494,80 @@ KFR_WINDOW_BY_TYPE(gaussian)
 KFR_WINDOW_BY_TYPE(lanczos)
 }
 
-KFR_INLINE internal::in_window<>::expression_rectangular<fbase> window_rectangular(size_t size)
+KFR_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size) // builds an expression_rectangular over fbase (not templated on T)
 {
-    return internal::in_window<>::expression_rectangular<fbase>(size, fbase());
+    return internal::expression_rectangular<fbase>(size, fbase()); // second argument is a value-initialized fbase
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_triangular<T> window_triangular(size_t size,
-                                                                             ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_triangular<T>(size);
+    return internal::expression_triangular<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_bartlett<T> window_bartlett(size_t size,
-                                                                         ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_bartlett<T>(size);
+    return internal::expression_bartlett<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_cosine<T>(size);
+    return internal::expression_cosine<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_hann<T>(size);
+    return internal::expression_hann<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_bartlett_hann<T> window_bartlett_hann(size_t size,
-                                                                                   ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused
 {
-    return internal::in_window<>::expression_bartlett_hann<T>(size);
+    return internal::expression_bartlett_hann<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_hamming<T> window_hamming(size_t size, T alpha = 0.54,
-                                                                       ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_hamming<T> window_hamming(size_t size, T alpha = 0.54,
+                                                           ctype_t<T> = ctype_t<T>()) // tag parameter, unnamed and unused in the body
 {
-    return internal::in_window<>::expression_hamming<T>(size, alpha);
+    return internal::expression_hamming<T>(size, alpha); // size and alpha are forwarded; any further constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_bohman<T>(size);
+    return internal::expression_bohman<T>(size); // only size is forwarded; other constructor arguments keep their defaults
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_blackman<T> window_blackman(
+KFR_INLINE internal::expression_blackman<T> window_blackman( // Blackman window factory; the trailing ctype_t<T> tag is unused in the body
     size_t size, T alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric,
     ctype_t<T> = ctype_t<T>())
 {
-    return internal::in_window<>::expression_blackman<T>(size, alpha, symmetry);
+    return internal::expression_blackman<T>(size, alpha, symmetry); // all three user-visible arguments are forwarded unchanged
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_blackman_harris<T> window_blackman_harris(
+KFR_INLINE internal::expression_blackman_harris<T> window_blackman_harris( // Blackman-Harris window factory; ctype_t<T> tag is unused in the body
     size_t size, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>())
 {
-    return internal::in_window<>::expression_blackman_harris<T>(size, T(), symmetry);
+    return internal::expression_blackman_harris<T>(size, T(), symmetry); // T() fills the constructor's unnamed, ignored second parameter
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_kaiser<T> window_kaiser(size_t size, T beta = T(0.5),
-                                                                     ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, T beta = T(0.5),
+                                                         ctype_t<T> = ctype_t<T>()) // tag parameter, unnamed and unused in the body
 {
-    return internal::in_window<>::expression_kaiser<T>(size, beta);
+    return internal::expression_kaiser<T>(size, beta); // symmetry is not forwarded, so the constructor default (symmetric) applies
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_flattop<T>(size);
+    return internal::expression_flattop<T>(size); // symmetry is not forwarded, so the constructor default (symmetric) applies
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_gaussian<T> window_gaussian(size_t size, T alpha = 2.5,
-                                                                         ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, T alpha = 2.5,
+                                                             ctype_t<T> = ctype_t<T>()) // tag parameter, unnamed and unused in the body
 {
-    return internal::in_window<>::expression_gaussian<T>(size, alpha);
+    return internal::expression_gaussian<T>(size, alpha); // symmetry is not forwarded, so the constructor default (symmetric) applies
 }
 template <typename T = fbase>
-KFR_INLINE internal::in_window<>::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) // ctype_t<T> tag is unnamed and unused in the body
 {
-    return internal::in_window<>::expression_lanczos<T>(size);
+    return internal::expression_lanczos<T>(size); // alpha and symmetry are not forwarded; constructor defaults (2.5, symmetric) apply
 }
 
 template <typename T           = fbase, window_type type,
@@ -681,5 +598,3 @@ KFR_NOINLINE expression_pointer<T> window(size_t size, window_type type, T win_p
         fn_returns<expression_pointer<T>>());
 }
 }
-
-#pragma clang diagnostic pop
diff --git a/include/kfr/expressions/reduce.hpp b/include/kfr/expressions/reduce.hpp
@@ -34,192 +34,99 @@ namespace kfr
 template <typename T>
 KFR_INLINE T final_mean(T value, size_t size)
 {
-    return value / size;
+    return value / T(size); // size is converted to T first, so the division is carried out in T
 }
 KFR_FN(final_mean)
 
 template <typename T>
 KFR_INLINE T final_rootmean(T value, size_t size)
 {
-    return internal::builtin_sqrt(value / size);
+    return internal::builtin_sqrt(value / T(size)); // square root of value / size, with size converted to T before dividing
 }
 KFR_FN(final_rootmean)
 
 namespace internal
 {
-template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, size_t, T>::value)>
+template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, T, size_t>::value)> // chosen when finalfn accepts (value, size), matching the call below
 KFR_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t size, T value)
 {
     return finalfn(value, size);
 }
-template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, size_t, T>::value)>
+template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, T, size_t>::value)> // fallback when finalfn is not callable with (value, size); size is dropped
 KFR_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value)
 {
     return finalfn(value);
 }
 
-template <cpu_t cpu = cpu_t::native>
-struct in_reduce
+template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, cpu_t cpu = cpu_t::native>
+struct expression_reduce : output_expression // output expression that folds every vector written to it into one accumulator
 {
+    constexpr static size_t width = vector_width<T, cpu> * bitness_const(1, 2); // lane count of the accumulator vector
 
-    template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn>
-    struct expression_reduce : output_expression
+    expression_reduce(ReduceFn&& reducefn, TransformFn&& transformfn, FinalFn&& finalfn)
+        : counter(0), reducefn(std::move(reducefn)), transformfn(std::move(transformfn)),
+          finalfn(std::move(finalfn)), value(resize<width>(make_vector(this->reducefn(initialvalue<T>{})))) // seed via the member: the parameter was moved from above
     {
-        using Tsubtype                = subtype<T>;
-        constexpr static size_t width = vector_width<Tsubtype, cpu> * bitness_const(1, 2);
-
-        expression_reduce(ReduceFn&& reducefn, TransformFn&& transformfn, FinalFn&& finalfn)
-            : counter(0), reducefn(std::move(reducefn)), transformfn(std::move(transformfn)),
-              finalfn(std::move(finalfn)), value(resize<width>(make_vector(reducefn(initialvalue<T>{}))))
-        {
-        }
-
-        template <typename U, size_t N>
-        KFR_INLINE void operator()(coutput_t, size_t, vec<U, N> x) const
-        {
-            counter += N;
-            process(x);
-        }
-
-        KFR_INLINE T get()
-        {
-            return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn));
-        }
-
-    protected:
-        void reset() { counter = 0; }
-        template <size_t N, KFR_ENABLE_IF(N == width)>
-        KFR_INLINE void process(vec<Tsubtype, N> x) const
-        {
-            value = reducefn(transformfn(x), value);
-        }
-
-        template <size_t N, KFR_ENABLE_IF(N < width)>
-        KFR_INLINE void process(vec<Tsubtype, N> x) const
-        {
-            value = combine(value, reducefn(transformfn(x), narrow<N>(value)));
-        }
-
-        template <size_t N, KFR_ENABLE_IF(N > width)>
-        KFR_INLINE void process(vec<Tsubtype, N> x) const
-        {
-            process(low(x));
-            process(high(x));
-        }
-
-        mutable size_t counter;
-        retarget<ReduceFn, cpu> reducefn;
-        retarget<TransformFn, cpu> transformfn;
-        retarget<FinalFn, cpu> finalfn;
-        mutable vec<Tsubtype, width> value;
-    };
-
-    template <typename ReduceFn, typename TransformFn = fn_pass_through, typename FinalFn = fn_pass_through,
-              typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T reduce(E1&& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn_pass_through(),
-                         FinalFn&& finalfn = fn_pass_through())
-    {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        const size_t size = e1.size();
-        using reducer_t   = expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>;
-        reducer_t red(std::forward<ReduceFn>(reducefn), std::forward<TransformFn>(transformfn),
-                      std::forward<FinalFn>(finalfn));
-        process<T, cpu>(red, std::forward<E1>(e1), size);
-
-        return red.get();
-    }
-
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T sum(E1&& x)
-    {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_add());
     }
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T mean(E1&& x)
+    template <typename U, size_t N>
+    KFR_INLINE void operator()(coutput_t, size_t, vec<U, N> x) const // sink called per written vector; the index argument is ignored
     {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_add(), fn_pass_through(), fn_final_mean());
+        counter += N; // running element count, later handed to finalfn as the size
+        process(x);
     }
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T min(E1&& x)
-    {
-        using fn_min = typename in_min_max<cpu>::fn_min;
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_min());
-    }
+    KFR_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); } // collapse lanes with reducefn, then apply finalfn
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T max(E1&& x)
-    {
-        using fn_max = typename in_min_max<cpu>::fn_max;
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_max());
-    }
+protected:
+    void reset() { counter = 0; } // clears only the counter; the accumulator `value` is left as is
+    KFR_INLINE void process(vec<T, width> x) const { value = reducefn(transformfn(x), value); } // exact-width case: fold directly into the accumulator
 
-    template <typename E1, typename E2,
-              typename T = value_type_of<decltype(std::declval<E1>() * std::declval<E2>())>>
-    KFR_SINTRIN T dotproduct(E1&& x, E2&& y)
+    template <size_t N, KFR_ENABLE_IF(N < width)>
+    KFR_INLINE void process(vec<T, N> x) const // narrower input: reduced against narrow<N>(value), result merged back via combine
     {
-        auto m    = std::forward<E1>(x) * std::forward<E2>(y);
-        using E12 = decltype(m);
-        static_assert(!is_generic<E12>::value, "e1 * e2 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E12>::value, "e1 * e2 must be a sized expression (use typed<T>())");
-        return reduce(std::move(m), fn_add());
+        value = combine(value, reducefn(transformfn(x), narrow<N>(value)));
     }
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T rms(E1&& x)
+    template <size_t N, KFR_ENABLE_IF(N > width)>
+    KFR_INLINE void process(vec<T, N> x) const // wider input: split into halves and process each recursively
     {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_add(), fn_sqr(), fn_final_rootmean());
+        process(low(x));
+        process(high(x));
     }
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T sumsqr(E1&& x)
-    {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_add(), fn_sqr());
-    }
+    mutable size_t counter; // mutable because operator() is const
+    retarget<ReduceFn, cpu> reducefn; // declared before `value`, so it is already initialized when `value` is seeded
+    retarget<TransformFn, cpu> transformfn;
+    retarget<FinalFn, cpu> finalfn;
+    mutable vec<T, width> value; // accumulator; mutable because process() is const
+};
 
-    template <typename E1, typename T = value_type_of<E1>>
-    KFR_SINTRIN T product(E1&& x)
-    {
-        static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-        static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-        return reduce(std::forward<E1>(x), fn_mul());
-    }
+template <typename ReduceFn, typename TransformFn = fn_pass_through, typename FinalFn = fn_pass_through,
+          typename E1, typename T = value_type_of<E1>>
+KFR_SINTRIN T reduce(E1&& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn_pass_through(), // runs e1 through an expression_reduce and returns its final value
+                     FinalFn&& finalfn = fn_pass_through())
+{
+    static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
+    static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
+    const size_t size = e1.size(); // read before e1 is forwarded into process() below
+    using reducer_t   = expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>;
+    reducer_t red(std::forward<ReduceFn>(reducefn), std::forward<TransformFn>(transformfn),
+                  std::forward<FinalFn>(finalfn));
+    process<T>(red, std::forward<E1>(e1), size); // presumably evaluates e1 into red for `size` elements -- process() is defined elsewhere
 
-    KFR_SPEC_FN(in_reduce, reduce)
-    KFR_SPEC_FN(in_reduce, sum)
-    KFR_SPEC_FN(in_reduce, dotproduct)
-    KFR_SPEC_FN(in_reduce, rms)
-    KFR_SPEC_FN(in_reduce, sumsqr)
-    KFR_SPEC_FN(in_reduce, mean)
-    KFR_SPEC_FN(in_reduce, min)
-    KFR_SPEC_FN(in_reduce, max)
-    KFR_SPEC_FN(in_reduce, product)
-};
+    return red.get(); // horizontally reduces the accumulator and applies finalfn
 }
 
-namespace native
-{
+KFR_FN(reduce)
+}
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_SINTRIN T sum(E1&& x)
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::sum(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), fn_add()); // fn_add as the reduce step; transform and final step stay at their pass-through defaults
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
@@ -227,32 +134,50 @@ KFR_SINTRIN T mean(E1&& x)
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::mean(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), fn_add(), fn_pass_through(), fn_final_mean());
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T max(E1&& x)
+KFR_SINTRIN T minof(E1&& x) // reduces the typed, sized expression x with internal::fn_min
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::max(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), internal::fn_min()); // transform and final step stay at their pass-through defaults
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T min(E1&& x)
+KFR_SINTRIN T maxof(E1&& x) // reduces the typed, sized expression x with internal::fn_max
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::min(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), internal::fn_max()); // transform and final step stay at their pass-through defaults
+}
+
+template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_SINTRIN T absminof(E1&& x) // reduces the typed, sized expression x with internal::fn_absmin
+{
+    static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
+    static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
+    return internal::reduce(std::forward<E1>(x), internal::fn_absmin()); // transform and final step stay at their pass-through defaults
+}
+
+template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_SINTRIN T absmaxof(E1&& x) // reduces the typed, sized expression x with internal::fn_absmax
+{
+    static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
+    static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
+    return internal::reduce(std::forward<E1>(x), internal::fn_absmax()); // transform and final step stay at their pass-through defaults
 }
 
 template <typename E1, typename E2, typename T = value_type_of<E1>,
           KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
 KFR_SINTRIN T dotproduct(E1&& x, E2&& y)
 {
-    static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
-    static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::dotproduct(std::forward<E1>(x), std::forward<E2>(y));
+    auto m    = std::forward<E1>(x) * std::forward<E2>(y); // product expression of the two inputs; summed below
+    using E12 = decltype(m); // the checks below apply to the product, not to x alone
+    static_assert(!is_generic<E12>::value, "e1 * e2 must be a typed expression (use typed<T>())");
+    static_assert(!is_infinite<E12>::value, "e1 * e2 must be a sized expression (use typed<T>())");
+    return internal::reduce(std::move(m), fn_add()); // NOTE(review): return type T is value_type_of<E1>, while reduce works in the product's value type
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
@@ -260,7 +185,7 @@ KFR_SINTRIN T rms(E1&& x)
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::rms(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), fn_add(), fn_sqr(), fn_final_rootmean());
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
@@ -268,7 +193,7 @@ KFR_SINTRIN T sumsqr(E1&& x)
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::sumsqr(std::forward<E1>(x));
+    return internal::reduce(std::forward<E1>(x), fn_add(), fn_sqr());
 }
 
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
@@ -276,7 +201,6 @@ KFR_SINTRIN T product(E1&& x)
 {
     static_assert(!is_generic<E1>::value, "e1 must be a typed expression (use typed<T>())");
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use typed<T>())");
-    return internal::in_reduce<>::product(std::forward<E1>(x));
-}
+    return internal::reduce(std::forward<E1>(x), fn_mul());
 }
 }
diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp
@@ -45,7 +45,3 @@
 #include "base/sqrt.hpp"
 #include "base/tan.hpp"
 
-namespace kfr
-{
-using namespace native;
-}
diff --git a/sources.cmake b/sources.cmake
@@ -20,6 +20,7 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/base/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/asin_acos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/atan.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/complex.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/constants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/digitreverse.hpp
@@ -31,6 +32,7 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/base/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/min_max.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/modzerobessel.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/read_write.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/round.hpp
@@ -71,7 +73,6 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/expressions/basic.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/expressions/conversion.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/expressions/generators.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/expressions/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/expressions/pointer.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/expressions/reduce.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/io/audiofile.hpp
diff --git a/tests/basic_vector_test.cpp b/tests/basic_vector_test.cpp
@@ -11,142 +11,97 @@
 #include <kfr/vec.hpp>
 #include <kfr/version.hpp>
 
+#include "testo/testo.hpp"
+
 using namespace kfr;
-using namespace kfr::native;
 
-template <typename T>
-void print_type(const T& value)
+TEST(test)
 {
-    println(type_name(value), ":");
-    println(value);
-}
-
-int main(int /*argc*/, char** /*argv*/)
-{
-    println(library_version());
-    // >>> KFR ...
-
     // How to make a vector:
 
     // * Use constructor
     const vec<double, 4> first{ 1, 2.5, -infinity, 3.1415926 };
-    print_type(first);
-    // >>> kfr::vec<double, 4>:
-    // >>>         1       2.5      -inf   3.14159
+    CHECK(first == vec<double, 4>{ 1, 2.5, -infinity, 3.1415926 });
 
     // * Use make_vector function
     const auto second = make_vector(-1, +1);
-    print_type(second);
-    // >>> kfr::vec<int, 2>:
-    // >>>        -1         1
+    CHECK(second == vec<int, 2>{ -1, 1 });
 
     // * Convert from vector of other type:
     const vec<int, 4> int_vector{ 10, 20, 30, 40 };
     const vec<double, 4> double_vector = cast<double>(int_vector);
-    print_type(double_vector);
-    // >>> kfr::vec<double, 4>:
-    // >>>        10        20        30        40
+    CHECK(double_vector == vec<double, 4>{ 10, 20, 30, 40 });
 
     // * Concat two vectors:
     const vec<int, 1> left_part{ 1 };
     const vec<int, 1> right_part{ 2 };
     const vec<int, 2> pair{ left_part, right_part };
-    print_type(pair);
-    // >>> kfr::vec<int, 2>:
-    // >>>         1         2
+    CHECK(pair == vec<int, 2>{ 1, 2 });
 
     // * Same, but using make_vector and concat:
     const vec<int, 2> pair2 = concat(make_vector(10), make_vector(20));
-    print_type(pair2);
-    // >>> kfr::vec<int, 2>:
-    // >>>        10        20
+    CHECK(pair2 == vec<int, 2>{ 10, 20 });
 
     // * Repeat vector multiple times:
     const vec<short, 8> repeated = repeat<4>(make_vector<short>(0, -1));
-    print_type(repeated);
-    // >>> kfr::vec<short, 8>:
-    // >>>         0        -1         0        -1         0        -1         0        -1
+    CHECK(repeated == vec<short, 8>{ 0, -1, 0, -1, 0, -1, 0, -1 });
 
     // * Use enumerate to generate sequence of numbers:
     const vec<int, 8> eight = enumerate<int, 8>();
-    print_type(eight);
-    // >>> kfr::vec<int, 8>:
-    // >>>         0         1         2         3         4         5         6         7
+    CHECK(eight == vec<int, 8>{ 0, 1, 2, 3, 4, 5, 6, 7 });
 
     // * Vectors can be of any length...
     const vec<int, 1> one{ 42 };
     const vec<int, 2> two = concat(one, make_vector(42));
-    print_type(two);
-    // >>> kfr::vec<int, 2>:
-    // >>>        42        42
+    CHECK(two == vec<int, 2>{ 42, 42 });
 
     const vec<u8, 256> very_long_vector = repeat<64>(make_vector<u8>(1, 2, 4, 8));
-    print_type(slice<0, 17>(very_long_vector));
-    // >>> kfr::vec<unsigned char, 17>:
-    // >>>         1         2         4         8         1         2         4         8
-    // >>>         1         2         4         8         1         2         4         8
-    // >>>         1
+    CHECK(slice<0, 17>(very_long_vector) ==
+          vec<unsigned char, 17>{ 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1 });
 
     // * ...really any:
     using big_vector = vec<i16, 107>;
     big_vector v107  = enumerate<i16, 107>();
-    print_type(hadd(v107));
-    // >>> short:
-    // >>> 5671
+    CHECK(hadd(v107) == static_cast<short>(5671));
 
     using color       = vec<u8, 3>;
     const color green = cast<u8>(make_vector(0.0, 1.0, 0.0) * 255);
-    print_type(green);
-    // >>> kfr::vec<unsigned char, 3>:
-    // >>>         0       255         0
+    CHECK(green == vec<unsigned char, 3>{ 0, 255, 0 });
 
     // Vectors support all standard operators:
     const auto op1    = make_vector(0, 1, 10, 100);
     const auto op2    = make_vector(20, 2, -2, 200);
     const auto result = op1 * op2 - 4;
-    print_type(result);
-    // >>> kfr::vec<int, 4>:
-    // >>>        -4        -2       -24     19996
+    CHECK(result == vec<int, 4>{ -4, -2, -24, 19996 });
 
     // * Transform vector:
     const vec<int, 8> numbers1 = enumerate<int, 8>();
     const vec<int, 8> numbers2 = enumerate<int, 8>() + 100;
-    print_type(odd(numbers1));
-    print_type(even(numbers2));
-    // >>> kfr::vec<int, 4>:
-    // >>>         1         3         5         7
-    // >>> kfr::vec<int, 4>:
-    // >>>       100       102       104       106
+    CHECK(odd(numbers1) == vec<int, 4>{ 1, 3, 5, 7 });
+    CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 });
 
     // * The following command pairs are equivalent:
-    print_type(permute<0, 2, 1, 3, 4, 6, 5, 7>(numbers1));
-    print_type(permute<0, 2, 1, 3>(numbers1));
-    // >>> kfr::vec<int, 8>:
-    // >>>         0         2         1         3         4         6         5         7
-    // >>> kfr::vec<int, 8>:
-    // >>>         0         2         1         3         4         6         5         7
-
-    print_type(shuffle<0, 8, 2, 10, 4, 12, 6, 14>(numbers1, numbers2));
-    print_type(shuffle<0, 8>(numbers1, numbers2));
-    // >>> kfr::vec<int, 8>:
-    // >>>         0       100         2       102         4       104         6       106
-    // >>> kfr::vec<int, 8>:
-    // >>>         0       100         2       102         4       104         6       106
-
-    print_type(blend<0, 1, 1, 0, 1, 1, 0, 1>(numbers1, numbers2));
-    print_type(blend<0, 1, 1>(numbers1, numbers2));
-    // >>> kfr::vec<int, 8>:
-    // >>>         0       101       102         3       104       105         6       107
-    // >>> kfr::vec<int, 8>:
-    // >>>         0       101       102         3       104       105         6       107
+    CHECK(permute(numbers1, elements<0, 2, 1, 3, 4, 6, 5, 7>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+    CHECK(permute(numbers1, elements<0, 2, 1, 3>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+
+    CHECK(shuffle(numbers1, numbers2, elements<0, 8, 2, 10, 4, 12, 6, 14>) ==
+          vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
+    CHECK(shuffle(numbers1, numbers2, elements<0, 8>) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
+
+    CHECK(blend(numbers1, numbers2, elements<0, 1, 1, 0, 1, 1, 0, 1>) ==
+          vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
+    CHECK(blend(numbers1, numbers2, elements<0, 1, 1>) ==
+          vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
 
     // * Transpose matrix:
     const auto sixteen = enumerate<float, 16>();
-    print_type(transpose<4>(sixteen));
-    // >>> kfr::vec<float, 16>:
-    // >>>         0         4         8        12         1         5         9        13
-    // >>>         2         6        10        14         3         7        11        15
-    // >>>
+    CHECK(transpose<4>(sixteen) ==
+          vec<float, 16>{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
+}
+
+int main(int /*argc*/, char** /*argv*/)
+{
+    println(library_version());
 
-    return 0;
+    return testo::run_all("", true);
 }
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -10,7 +10,6 @@
 #include <kfr/base/complex.hpp>
 #include <kfr/cometa/string.hpp>
 #include <kfr/expressions/basic.hpp>
-#include <kfr/expressions/operators.hpp>
 #include <kfr/expressions/reduce.hpp>
 #include <kfr/math.hpp>
 #include <kfr/version.hpp>
@@ -23,6 +22,7 @@ void assert_is_same()
     static_assert(std::is_same<T1, T2>::value, "");
 }
 
+
 TEST(complex_vector)
 {
     const vec<c32, 1> c32x1{ c32{ 0, 1 } };
@@ -179,6 +179,9 @@ int main(int argc, char** argv)
     static_assert(vector_width<i32, cpu_t::sse2> == 4, "");
     static_assert(vector_width<complex<i32>, cpu_t::sse2> == 2, "");
 
+    static_assert(is_numeric<vec<complex<float>, 4>>::value, "");
+    static_assert(is_numeric_args<vec<complex<float>, 4>>::value, "");
+
     static_assert(sizeof(vec<c32, 4>) == sizeof(vec<f32, 8>), "");
     static_assert(vec<f32, 4>::size() == 4, "");
     static_assert(vec<c32, 4>::size() == 4, "");
diff --git a/tests/conv_test.cpp b/tests/conv_test.cpp
@@ -23,7 +23,7 @@ TEST(test_convolve)
     univector<double, 5> b({ 0.25, 0.5, 1.0, 0.5, 0.25 });
     univector<double> c = convolve(a, b);
     CHECK(c.size() == 9);
-    CHECK(native::rms(c - univector<double>({ 0.25, 1., 2.75, 5., 7.5, 8.5, 7.75, 3.5, 1.25 })) < 0.0001);
+    CHECK(rms(c - univector<double>({ 0.25, 1., 2.75, 5., 7.5, 8.5, 7.75, 3.5, 1.25 })) < 0.0001);
 }
 
 int main(int argc, char** argv)
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -14,7 +14,6 @@
 #include <kfr/dft/fft.hpp>
 #include <kfr/dft/reference_dft.hpp>
 #include <kfr/expressions/basic.hpp>
-#include <kfr/expressions/operators.hpp>
 #include <kfr/expressions/reduce.hpp>
 #include <kfr/io/tostring.hpp>
 #include <kfr/math.hpp>
diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp
@@ -1,7 +1,5 @@
 #include <kfr/math.hpp>
-#include <kfr/vec.hpp>
 
 using namespace kfr;
-using namespace kfr::native;
 
 int main(int argc, char** argv) { return 0; }
diff --git a/tests/fracdelay_test.cpp b/tests/fracdelay_test.cpp
@@ -20,17 +20,17 @@ using namespace kfr;
 TEST(test_fracdelay)
 {
     univector<double, 5> a({ 1, 2, 3, 4, 5 });
-    univector<double, 5> b = native::fracdelay(a, 0.5);
-    CHECK(native::rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < c_epsilon<double> * 5);
+    univector<double, 5> b = fracdelay(a, 0.5);
+    CHECK(rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < c_epsilon<double> * 5);
 
-    b = native::fracdelay(a, 0.1);
-    CHECK(native::rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < c_epsilon<double> * 5);
+    b = fracdelay(a, 0.1);
+    CHECK(rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < c_epsilon<double> * 5);
 
-    b = native::fracdelay(a, 0.0);
-    CHECK(native::rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < c_epsilon<double> * 5);
+    b = fracdelay(a, 0.0);
+    CHECK(rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < c_epsilon<double> * 5);
 
-    b = native::fracdelay(a, 1.0);
-    CHECK(native::rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < c_epsilon<double> * 5);
+    b = fracdelay(a, 1.0);
+    CHECK(rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < c_epsilon<double> * 5);
 }
 
 int main(int argc, char** argv)
diff --git a/tests/stat_test.cpp b/tests/stat_test.cpp
@@ -20,32 +20,32 @@ TEST(test_stat)
 {
     {
         univector<float, 5> a({ 1, 2, 3, 4, 5 });
-        CHECK(native::sum(a) == 15);
-        CHECK(native::mean(a) == 3);
-        CHECK(native::min(a) == 1);
-        CHECK(native::max(a) == 5);
-        CHECK(native::sumsqr(a) == 55);
-        CHECK(native::rms(a) == 3.316624790355399849115f);
-        CHECK(native::product(a) == 120);
+        CHECK(sum(a) == 15);
+        CHECK(mean(a) == 3);
+        CHECK(minof(a) == 1);
+        CHECK(maxof(a) == 5);
+        CHECK(sumsqr(a) == 55);
+        CHECK(rms(a) == 3.316624790355399849115f);
+        CHECK(product(a) == 120);
     }
     {
         univector<double, 5> a({ 1, 2, 3, 4, 5 });
-        CHECK(native::sum(a) == 15);
-        CHECK(native::mean(a) == 3);
-        CHECK(native::min(a) == 1);
-        CHECK(native::max(a) == 5);
-        CHECK(native::sumsqr(a) == 55);
-        CHECK(native::rms(a) == 3.316624790355399849115);
-        CHECK(native::product(a) == 120);
+        CHECK(sum(a) == 15);
+        CHECK(mean(a) == 3);
+        CHECK(minof(a) == 1);
+        CHECK(maxof(a) == 5);
+        CHECK(sumsqr(a) == 55);
+        CHECK(rms(a) == 3.316624790355399849115);
+        CHECK(product(a) == 120);
     }
     {
         univector<int, 5> a({ 1, 2, 3, 4, 5 });
-        CHECK(native::sum(a) == 15);
-        CHECK(native::mean(a) == 3);
-        CHECK(native::min(a) == 1);
-        CHECK(native::max(a) == 5);
-        CHECK(native::sumsqr(a) == 55);
-        CHECK(native::product(a) == 120);
+        CHECK(sum(a) == 15);
+        CHECK(mean(a) == 3);
+        CHECK(minof(a) == 1);
+        CHECK(maxof(a) == 5);
+        CHECK(sumsqr(a) == 55);
+        CHECK(product(a) == 120);
     }
 }

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

M	examples/biquads.cpp	\|	2	--
M	examples/dft.cpp	\|	5	++---
M	examples/fir.cpp	\|	2	--
M	examples/resampling.cpp	\|	5	-----
M	examples/window.cpp	\|	2	--
M	include/kfr/base/abs.hpp	\|	128	+++++++++++++++++++++++++------------------------------------------------------
M	include/kfr/base/asin_acos.hpp	\|	66	+++++++++++++++++++++---------------------------------------------
M	include/kfr/base/atan.hpp	\|	407	+++++++++++++++++++++++++++++++++++++++----------------------------------------
A	include/kfr/base/clamp.hpp	\|	73	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	include/kfr/base/complex.hpp	\|	455	++++++++++++++++++++++++++++++++++---------------------------------------------
M	include/kfr/base/gamma.hpp	\|	66	+++++++++++++++++++++++++++---------------------------------------
M	include/kfr/base/hyperbolic.hpp	\|	178	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	include/kfr/base/log_exp.hpp	\|	768	++++++++++++++++++++++++++++++++++++++-----------------------------------------
M	include/kfr/base/logical.hpp	\|	466	+++++++++++++++++++++++++++++--------------------------------------------------
M	include/kfr/base/min_max.hpp	\|	436	+++++++++++++++++++++----------------------------------------------------------
A	include/kfr/base/modzerobessel.hpp	\|	113	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	include/kfr/base/operators.hpp	\|	36	++++++++++++++++++++++++++++++++++++
M	include/kfr/base/round.hpp	\|	377	++++++++++++++++++++++++++++++++-----------------------------------------------
M	include/kfr/base/saturation.hpp	\|	246	+++++++++++++++++++++++++++++++------------------------------------------------
M	include/kfr/base/select.hpp	\|	205	++++++++++++++++++++-----------------------------------------------------------
M	include/kfr/base/sin_cos.hpp	\|	685	+++++++++++++++++++++++++++++++++----------------------------------------------
M	include/kfr/base/sqrt.hpp	\|	79	++++++++++++++++++++++---------------------------------------------------------
M	include/kfr/base/tan.hpp	\|	227	+++++++++++++++++++++++++++++++++++--------------------------------------------
M	include/kfr/dft/conv.hpp	\|	1	-
M	include/kfr/dft/fft.hpp	\|	4	++--
M	include/kfr/dft/ft.hpp	\|	2	+-
M	include/kfr/dsp/fir.hpp	\|	115	+++++++++++++++++++++++++++++++++----------------------------------------------
M	include/kfr/dsp/fir_design.hpp	\|	256	+++++++++++++++++++++++++------------------------------------------------------
M	include/kfr/dsp/fracdelay.hpp	\|	8	++------
M	include/kfr/dsp/oscillators.hpp	\|	300	+++++++++++++++++++++++++++++++++----------------------------------------------
M	include/kfr/dsp/resample.hpp	\|	303	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	include/kfr/dsp/units.hpp	\|	239	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	include/kfr/dsp/weighting.hpp	\|	139	+++++++++++++++++++++++++++++++++++++------------------------------------------
M	include/kfr/dsp/window.hpp	\|	901	++++++++++++++++++++++++++++++++++++-------------------------------------------
M	include/kfr/expressions/reduce.hpp	\|	228	+++++++++++++++++++++++++++-----------------------------------------------------
M	include/kfr/math.hpp	\|	4	----
M	sources.cmake	\|	3	++-
M	tests/basic_vector_test.cpp	\|	119	+++++++++++++++++++++++++------------------------------------------------------
M	tests/complex_test.cpp	\|	5	++++-
M	tests/conv_test.cpp	\|	2	+-
M	tests/dft_test.cpp	\|	1	-
M	tests/empty_test.cpp	\|	2	--
M	tests/fracdelay_test.cpp	\|	16	++++++++--------
M	tests/stat_test.cpp	\|	40	++++++++++++++++++++--------------------