commit 51f526fdde45132c83c9c6368a770aed291be3f2
parent 509d0251987260ef1f7845592147120e1158b75b
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Mon, 19 Nov 2018 05:41:10 +0300
KFR 3.0 fixes/tests
Diffstat:
24 files changed, 484 insertions(+), 205 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -15,13 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
-
-if (CMAKE_VERSION VERSION_LESS "2.8.12")
- function(add_compile_options)
- add_definitions(${ARGN})
- endfunction(add_compile_options)
-endif ()
+cmake_minimum_required(VERSION 3.0)
#set(DISABLE_CLANG 1)
@@ -91,7 +85,7 @@ if (${CMAKE_GENERATOR} STREQUAL "MinGW Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "MSYS Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "Unix Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "Ninja")
-
+
if (CMAKE_CXX_COMPILER MATCHES "clang" AND NOT CMAKE_CXX_COMPILER MATCHES "clang-cl")
if (WIN32)
# On windows, clang requires --target
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -15,7 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
include_directories(../include)
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -114,8 +114,8 @@ KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x)
r = mulsign(r, x);
r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
- r = select(y == 0.0f, (x < 0) & pi, r);
- r = (isnan(x) || isnan(y)) | mulsign(r, y);
+ r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
return r;
}
@@ -129,8 +129,8 @@ KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x)
r = mulsign(r, x);
r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r);
r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r);
- r = select(y == 0.0, (x < 0) & pi, r);
- r = (isnan(x) || isnan(y)) | mulsign(r, y);
+ r = select(y == 0.0, select(x < 0., pi, 0.), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
return r;
}
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -27,6 +27,7 @@
#include "univector.hpp"
#include "vec.hpp"
+#include "operators.hpp"
#include <algorithm>
namespace kfr
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -37,7 +37,8 @@ namespace intrinsics
template <size_t bits>
struct bitmask
{
- using type = findinttype<0, (1ull << bits) - 1>;
+ using type = conditional<(bits > 32), uint64_t,
+ conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>;
bitmask(type val) : value(val) {}
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -82,11 +82,14 @@ inline vec<u8, N> random_bits(random_bit_generator& gen)
{
return narrow<N>(bitcast<u8>(gen()));
}
+
template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
inline vec<u8, N> random_bits(random_bit_generator& gen)
{
- constexpr size_t N2 = prev_poweroftwo(N - 1);
- return concat(random_bits<N2>(gen), random_bits<N - N2>(gen));
+ constexpr size_t N2 = prev_poweroftwo(N - 1);
+ const vec<u8, N2> bits1 = random_bits<N2>(gen);
+ const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
+ return concat(bits1, bits2);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)>
@@ -162,7 +165,7 @@ struct expression_random_range : input_expression
const T min;
const T max;
};
-}
+} // namespace internal
template <typename T>
inline internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen)
@@ -187,4 +190,4 @@ inline internal::expression_random_range<T> gen_random_range(T min, T max)
{
return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max);
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -116,25 +116,25 @@ template <size_t N>
KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t - ((x < t) & 1.f);
+ return t - select(x < t, 1.f, 0.f);
}
template <size_t N>
KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t - ((x < t) & 1.0);
+ return t - select(x < t, 1., 0.);
}
template <size_t N>
KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t + ((x > t) & 1.f);
+ return t + select(x > t, 1.f, 0.f);
}
template <size_t N>
KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t + ((x > t) & 1.0);
+ return t + select(x > t, 1., 0.);
}
template <size_t N>
KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x)
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -167,7 +167,7 @@ KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, co
template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
- return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
+ return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
diff --git a/include/kfr/base/simd_clang.hpp b/include/kfr/base/simd_clang.hpp
@@ -189,7 +189,7 @@ struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N>
constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); }
constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); }
- constexpr mask_t asmask() const noexcept { return mask_t(*this); }
+ constexpr mask_t asmask() const noexcept { return mask_t(simd); }
#undef KFR_S
#undef KFR_U
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -61,14 +61,14 @@ template <typename T, size_t N, typename Tprecise = f64>
KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
{
const vec<T, N> xabs = abs(x);
- constexpr T div = constants<T>::fold_constant_div;
- vec<T, N> y = floor(xabs / div);
- quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
+ constexpr T div = constants<T>::fold_constant_div;
+ vec<T, N> y = floor(xabs / div);
+ quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
const mask<T, N> msk = (quadrant & 1) != 0;
- quadrant = kfr::select(msk, quadrant + 1, quadrant);
- y = select(msk, y + T(1.0), y);
- quadrant = quadrant & 7;
+ quadrant = kfr::select(msk, quadrant + 1, quadrant);
+ y = select(msk, y + T(1.0), y);
+ quadrant = quadrant & 7;
constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi);
constexpr T rem1 = constants<T>::fold_constant_rem1;
@@ -134,10 +134,11 @@ KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cos
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x_full, quadrant);
- mask<T, N> flip_sign = kfr::select(cosmask, (quadrant == 2) || (quadrant == 4), quadrant >= 4);
+ mask<T, N> flip_sign =
+ kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask();
mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
- usecos = usecos ^ cosmask;
+ usecos = usecos ^ cosmask;
vec<T, N> formula = trig_sincos(folded, usecos);
@@ -160,7 +161,7 @@ KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x)
vec<T, N> formula = trig_sincos(folded, usecos);
- formula = select(flip_sign ^ x, -formula, formula);
+ formula = select(flip_sign ^ mask<T, N>(x), -formula, formula);
return formula;
}
@@ -193,15 +194,15 @@ KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
vec<T, N> xx = x - pi;
vec<T, N> y = abs(xx);
- y = select(y > c_pi<T, 1, 2>, pi - y, y);
- y = y ^ (msk & ~xx);
+ y = select(y > c_pi<T, 1, 2>, pi - y, y);
+ y = y ^ (msk & ~xx);
vec<T, N> y2 = y * y;
vec<T, N> formula = c6;
vec<T, N> y3 = y2 * y;
- formula = fmadd(formula, y2, c4);
- formula = fmadd(formula, y2, c2);
- formula = formula * y3 + y;
+ formula = fmadd(formula, y2, c4);
+ formula = fmadd(formula, y2, c2);
+ formula = formula * y3 + y;
return formula;
}
@@ -316,7 +317,7 @@ KFR_SINTRIN Tout cossindeg(const T& x)
{
return cossin(x * constants<Tout>::degtorad);
}
-}
+} // namespace intrinsics
KFR_I_FN(sin)
KFR_I_FN(cos)
@@ -616,4 +617,4 @@ KFR_SINTRIN T cos3x(const T& sinx, const T& cosx)
{
return cosx * (1 - 4 * sqr(sinx));
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -51,10 +51,10 @@ KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inve
vec<T, N> x = y - k_real * pi_14;
mask<T, N> need_offset = (k & 1) != 0;
- x = select(need_offset, x - pi_14, x);
+ x = select(need_offset, x - pi_14, x);
vec<IT, N> k_mod4 = k & 3;
- inverse = (k_mod4 == 1) || (k_mod4 == 2);
+ inverse = (k_mod4 == 1) || (k_mod4 == 2);
return x;
}
@@ -62,7 +62,9 @@ template <size_t N>
KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
{
mask<f32, N> inverse;
- const vec<f32, N> x = trig_fold_simple(x_full, inverse);
+ vec<i32, N> quad;
+ const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01);
constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01);
@@ -90,7 +92,9 @@ template <size_t N>
KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
{
mask<f64, N> inverse;
- const vec<f64, N> x = trig_fold_simple(x_full, inverse);
+ vec<i64, N> quad;
+ const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01);
constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01);
@@ -129,7 +133,7 @@ KFR_SINTRIN flt_type<T> tandeg(const T& x)
{
return tan(x * c_degtorad<flt_type<T>>);
}
-}
+} // namespace intrinsics
KFR_I_FN(tan)
KFR_I_FN(tandeg)
@@ -156,4 +160,4 @@ KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
{
return { fn::tandeg(), std::forward<E1>(x) };
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -37,7 +37,7 @@ template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
CMT_INLINE vec<T, Nout> low(const vec<T, N>& x);
template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
CMT_INLINE vec<T, Nout> high(const vec<T, N>& x);
-}
+} // namespace kfr
#ifdef CMT_COMPILER_CLANG
#include "simd_clang.hpp"
@@ -65,7 +65,7 @@ template <typename T>
using maskfor = typename T::mask_t;
template <typename T, size_t N>
-struct mask : vec<T, N>
+struct mask : protected vec<T, N>
{
using base = vec<T, N>;
KFR_I_CE mask() noexcept = default;
@@ -73,17 +73,31 @@ struct mask : vec<T, N>
KFR_I_CE mask& operator=(const mask&) noexcept = default;
using simd_type = typename base::simd_type;
- KFR_I_CE mask(const base& v) noexcept : base(v) {}
+ simd_type operator*() const noexcept { return this->simd; }
+ simd_type& operator*() noexcept { return this->simd; }
+
+ KFR_I_CE mask(const base& v) noexcept
+ : base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()))
+ {
+ }
KFR_I_CE mask(const simd_type& simd) : base(simd) {}
template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
- KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m))
+ KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
{
}
template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m))
{
}
+ KFR_I_CE mask operator&(const mask& y) const noexcept
+ {
+ return static_cast<const base&>(*this) & static_cast<const base&>(y);
+ }
+ KFR_I_CE mask operator|(const mask& y) const noexcept
+ {
+ return static_cast<const base&>(*this) | static_cast<const base&>(y);
+ }
KFR_I_CE mask operator&&(const mask& y) const noexcept
{
return static_cast<const base&>(*this) & static_cast<const base&>(y);
@@ -96,7 +110,7 @@ struct mask : vec<T, N>
{
return static_cast<const base&>(*this) ^ static_cast<const base&>(y);
}
- KFR_I_CE mask operator^(const base& y) const noexcept { return static_cast<const base&>(*this) ^ y; }
+ KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); }
bool operator[](size_t index) const noexcept;
@@ -123,7 +137,7 @@ constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...>
{
return {};
}
-}
+} // namespace internal
template <size_t groupsize, size_t... indices>
constexpr inline auto scale() noexcept
@@ -209,13 +223,13 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N>
constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); }
constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); }
- constexpr friend vec operator++(vec& x, int)noexcept
+ constexpr friend vec operator++(vec& x, int) noexcept
{
const vec z = x;
++x;
return z;
}
- constexpr friend vec operator--(vec& x, int)noexcept
+ constexpr friend vec operator--(vec& x, int) noexcept
{
const vec z = x;
--x;
@@ -256,10 +270,9 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N>
CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept
{
*this = vec(static_cast<const base&>(*this))
- .shuffle(s, csizeseq_t<N>() +
- (csizeseq_t<N>() >= csize_t<index * Nin>() &&
- csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) *
- N);
+ .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() &&
+ csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) *
+ N);
}
struct element
{
@@ -314,7 +327,7 @@ template <typename T, size_t N>
struct is_vec_impl<vec<T, N>> : std::true_type
{
};
-}
+} // namespace internal
template <typename T>
using is_vec = internal::is_vec_impl<T>;
@@ -365,7 +378,7 @@ constexpr swiz<12> s12{};
constexpr swiz<13> s13{};
constexpr swiz<14> s14{};
constexpr swiz<15> s15{};
-}
+} // namespace swizzle
#endif
CMT_PRAGMA_GNU(GCC diagnostic push)
@@ -425,7 +438,7 @@ struct conversion<vec<To, N>, From>
static_assert(std::is_convertible<From, To>::value, "");
static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); }
};
-}
+} // namespace internal
template <typename T>
constexpr size_t size_of() noexcept
@@ -524,7 +537,7 @@ struct shuffle_index_wrap
{
constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; }
};
-}
+} // namespace internal
template <size_t count, typename T, size_t N, size_t Nout = N* count>
CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x)
@@ -585,7 +598,7 @@ CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, co
const T list[] = { static_cast<T>(args)... };
return vec<T, N>(list[indices]...);
}
-}
+} // namespace internal
/// Create vector from scalar values
/// @code
@@ -710,7 +723,7 @@ constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y)
{
return static_cast<vec<T1, N>>(x) ^ y;
}
-}
+} // namespace operators
using namespace operators;
@@ -739,7 +752,7 @@ constexpr vec<T, Nout> partial_mask()
{
return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>());
}
-}
+} // namespace internal
template <typename T>
using optvec = vec<T, platform<T>::vector_capacity / 4>;
@@ -751,6 +764,7 @@ using f32x4 = vec<f32, 4>;
using f32x8 = vec<f32, 8>;
using f32x16 = vec<f32, 16>;
using f32x32 = vec<f32, 32>;
+using f32x64 = vec<f32, 64>;
using f64x1 = vec<f64, 1>;
using f64x2 = vec<f64, 2>;
using f64x3 = vec<f64, 3>;
@@ -758,6 +772,7 @@ using f64x4 = vec<f64, 4>;
using f64x8 = vec<f64, 8>;
using f64x16 = vec<f64, 16>;
using f64x32 = vec<f64, 32>;
+using f64x64 = vec<f64, 64>;
using i8x1 = vec<i8, 1>;
using i8x2 = vec<i8, 2>;
using i8x3 = vec<i8, 3>;
@@ -765,6 +780,7 @@ using i8x4 = vec<i8, 4>;
using i8x8 = vec<i8, 8>;
using i8x16 = vec<i8, 16>;
using i8x32 = vec<i8, 32>;
+using i8x64 = vec<i8, 64>;
using i16x1 = vec<i16, 1>;
using i16x2 = vec<i16, 2>;
using i16x3 = vec<i16, 3>;
@@ -772,6 +788,7 @@ using i16x4 = vec<i16, 4>;
using i16x8 = vec<i16, 8>;
using i16x16 = vec<i16, 16>;
using i16x32 = vec<i16, 32>;
+using i16x64 = vec<i16, 64>;
using i32x1 = vec<i32, 1>;
using i32x2 = vec<i32, 2>;
using i32x3 = vec<i32, 3>;
@@ -779,6 +796,7 @@ using i32x4 = vec<i32, 4>;
using i32x8 = vec<i32, 8>;
using i32x16 = vec<i32, 16>;
using i32x32 = vec<i32, 32>;
+using i32x64 = vec<i32, 64>;
using i64x1 = vec<i64, 1>;
using i64x2 = vec<i64, 2>;
using i64x3 = vec<i64, 3>;
@@ -786,6 +804,7 @@ using i64x4 = vec<i64, 4>;
using i64x8 = vec<i64, 8>;
using i64x16 = vec<i64, 16>;
using i64x32 = vec<i64, 32>;
+using i64x64 = vec<i64, 64>;
using u8x1 = vec<u8, 1>;
using u8x2 = vec<u8, 2>;
using u8x3 = vec<u8, 3>;
@@ -793,6 +812,7 @@ using u8x4 = vec<u8, 4>;
using u8x8 = vec<u8, 8>;
using u8x16 = vec<u8, 16>;
using u8x32 = vec<u8, 32>;
+using u8x64 = vec<u8, 64>;
using u16x1 = vec<u16, 1>;
using u16x2 = vec<u16, 2>;
using u16x3 = vec<u16, 3>;
@@ -800,6 +820,7 @@ using u16x4 = vec<u16, 4>;
using u16x8 = vec<u16, 8>;
using u16x16 = vec<u16, 16>;
using u16x32 = vec<u16, 32>;
+using u16x64 = vec<u16, 64>;
using u32x1 = vec<u32, 1>;
using u32x2 = vec<u32, 2>;
using u32x3 = vec<u32, 3>;
@@ -807,6 +828,7 @@ using u32x4 = vec<u32, 4>;
using u32x8 = vec<u32, 8>;
using u32x16 = vec<u32, 16>;
using u32x32 = vec<u32, 32>;
+using u32x64 = vec<u32, 64>;
using u64x1 = vec<u64, 1>;
using u64x2 = vec<u64, 2>;
using u64x3 = vec<u64, 3>;
@@ -814,6 +836,7 @@ using u64x4 = vec<u64, 4>;
using u64x8 = vec<u64, 8>;
using u64x16 = vec<u64, 16>;
using u64x32 = vec<u64, 32>;
+using u64x64 = vec<u64, 64>;
using u8x2x2 = vec<vec<u8, 2>, 2>;
using i8x2x2 = vec<vec<i8, 2>, 2>;
@@ -851,7 +874,7 @@ using ivec4 = i32x4;
using uvec2 = u32x2;
using uvec3 = u32x3;
using uvec4 = u32x4;
-}
+} // namespace glsl_names
namespace opencl_names
{
using char2 = i8x2;
@@ -909,7 +932,7 @@ using double3 = f64x3;
using double4 = f64x4;
using double8 = f64x8;
using double16 = f64x16;
-}
+} // namespace opencl_names
namespace internal
{
@@ -958,7 +981,7 @@ constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>)
{
return make_vector(((void)Indices, void(), fn())...);
}
-}
+} // namespace internal
template <typename T, size_t N, typename Fn, typename... Args,
typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>>
@@ -1048,7 +1071,7 @@ CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>)
}
KFR_FN(low)
KFR_FN(high)
-}
+} // namespace kfr
namespace cometa
{
@@ -1105,7 +1128,7 @@ struct compound_type_traits<kfr::mask<T, N>>
return value[index];
}
};
-}
+} // namespace cometa
namespace std
{
@@ -1140,7 +1163,7 @@ struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
{
using type = kfr::mask<typename common_type<T1, T2>::type, N>;
};
-}
+} // namespace std
CMT_PRAGMA_GNU(GCC diagnostic pop)
CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -20,6 +20,8 @@ extern char* gets(char* __s);
#define CMT_ARCH_X32 1
#endif
+#ifndef CMT_FORCE_GENERIC_CPU
+
#if defined __AVX512F__ && !defined CMT_ARCH_AVX512
#define CMT_ARCH_AVX512 1
#define CMT_ARCH_AVX2 1
@@ -107,6 +109,8 @@ extern char* gets(char* __s);
#define CMT_ARCH_LZCNT 1
#endif
+#endif // CMT_FORCE_GENERIC_CPU
+
#if defined CMT_ARCH_AVX512
#define CMT_ARCH_NAME avx512
#elif defined CMT_ARCH_AVX2
diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp
@@ -26,6 +26,7 @@
#pragma once
#include "../base/complex.hpp"
+#include "../base/filter.hpp"
#include "../base/constants.hpp"
#include "../base/memory.hpp"
#include "../base/read_write.hpp"
@@ -42,88 +43,41 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
namespace kfr
{
+namespace internal
+{
+template <typename T>
+univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
+template <typename T>
+univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
+template <typename T>
+univector<T> autocorrelate(const univector_ref<const T>& src1);
+} // namespace internal
+
template <typename T, size_t Tag1, size_t Tag2>
-CMT_FUNC univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
+univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
- univector<complex<T>> src1padded = src1;
- univector<complex<T>> src2padded = src2;
- src1padded.resize(size, 0);
- src2padded.resize(size, 0);
-
- dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
- univector<u8> temp(dft->temp_size);
- dft->execute(src1padded, src1padded, temp);
- dft->execute(src2padded, src2padded, temp);
- src1padded = src1padded * src2padded;
- dft->execute(src1padded, src1padded, temp, true);
- const T invsize = reciprocal<T>(size);
- return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+ return internal::convolve(src1.slice(), src2.slice());
}
template <typename T, size_t Tag1, size_t Tag2>
-CMT_FUNC univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
+univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
- univector<complex<T>> src1padded = src1;
- univector<complex<T>> src2padded = reverse(src2);
- src1padded.resize(size, 0);
- src2padded.resize(size, 0);
- dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
- univector<u8> temp(dft->temp_size);
- dft->execute(src1padded, src1padded, temp);
- dft->execute(src2padded, src2padded, temp);
- src1padded = src1padded * src2padded;
- dft->execute(src1padded, src1padded, temp, true);
- const T invsize = reciprocal<T>(size);
- return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+ return internal::correlate(src1.slice(), src2.slice());
}
template <typename T, size_t Tag1>
-CMT_FUNC univector<T> autocorrelate(const univector<T, Tag1>& src)
+univector<T> autocorrelate(const univector<T, Tag1>& src)
{
- univector<T> result = correlate(src, src);
- result = result.slice(result.size() / 2);
- return result;
+ return internal::autocorrelate(src.slice());
}
template <typename T>
class convolve_filter : public filter<T>
{
public:
- explicit convolve_filter(size_t size, size_t block_size = 1024)
- : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size),
- segments((size + block_size - 1) / block_size)
- {
- }
- explicit convolve_filter(const univector<T>& data, size_t block_size = 1024)
- : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)),
- temp(fft.temp_size),
- segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
- ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
- input_position(0), position(0)
- {
- set_data(data);
- }
- void set_data(const univector<T>& data)
- {
- univector<T> input(fft.size);
- const T ifftsize = reciprocal(T(fft.size));
- for (size_t i = 0; i < ir_segments.size(); i++)
- {
- segments[i].resize(block_size);
- ir_segments[i].resize(block_size, 0);
- input = padded(data.slice(i * block_size, block_size));
-
- fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm);
- process(ir_segments[i], ir_segments[i] * ifftsize);
- }
- saved_input.resize(block_size, 0);
- scratch.resize(block_size * 2);
- premul.resize(block_size, 0);
- cscratch.resize(block_size);
- overlap.resize(block_size, 0);
- }
+ explicit convolve_filter(size_t size, size_t block_size = 1024);
+ explicit convolve_filter(const univector<T>& data, size_t block_size = 1024);
+ void set_data(const univector<T>& data);
protected:
void process_expression(T* dest, const expression_pointer<T>& src, size_t size) final
@@ -131,49 +85,7 @@ protected:
univector<T> input = truncate(src, size);
process_buffer(dest, input.data(), input.size());
}
- void process_buffer(T* output, const T* input, size_t size) final
- {
- size_t processed = 0;
- while (processed < size)
- {
- const size_t processing = std::min(size - processed, block_size - input_position);
- internal::builtin_memcpy(saved_input.data() + input_position, input + processed,
- processing * sizeof(T));
-
- process(scratch, padded(saved_input));
- fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
-
- if (input_position == 0)
- {
- process(premul, zeros());
- for (size_t i = 1; i < segments.size(); i++)
- {
- const size_t n = (position + i) % segments.size();
- fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm);
- }
- }
- fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position],
- dft_pack_format::Perm);
-
- fft.execute(scratch, cscratch, temp, dft_pack_format::Perm);
-
- process(make_univector(output + processed, processing),
- scratch.slice(input_position) + overlap.slice(input_position));
-
- input_position += processing;
- if (input_position == block_size)
- {
- input_position = 0;
- process(saved_input, zeros());
-
- internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
-
- position = position > 0 ? position - 1 : segments.size() - 1;
- }
-
- processed += processing;
- }
- }
+ void process_buffer(T* output, const T* input, size_t size) final;
const dft_plan_real<T> fft;
univector<u8> temp;
@@ -189,5 +101,6 @@ protected:
univector<T> overlap;
size_t position;
};
-}
+
+} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp
@@ -24,10 +24,13 @@
See https://www.kfrlib.com for details.
*/
+#include "../base/basic_expressions.hpp"
+#include "../testo/assert.hpp"
#include "bitrev.hpp"
+#include "cache.hpp"
+#include "convolution.hpp"
#include "fft.hpp"
#include "ft.hpp"
-#include "../testo/assert.hpp"
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
@@ -248,7 +251,7 @@ template <typename T, size_t width>
CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format)
{
const size_t count = stage_size / 4;
- size_t nnstep = size / stage_size;
+ size_t nnstep = size / stage_size;
DFT_ASSERT(width <= count);
CMT_LOOP_NOUNROLL
for (size_t n = 0; n < count; n += width)
@@ -259,7 +262,7 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
}
}
-#ifdef CMT_ARCH_X86
+#if defined CMT_ARCH_SSE
#ifdef CMT_COMPILER_GNU
#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
#else
@@ -484,11 +487,11 @@ struct fft_final_stage_impl : dft_stage<T>
}
protected:
- constexpr static size_t width = fft_vector_width<T>;
- constexpr static bool is_even = cometa::is_even(ilog2(size));
- constexpr static bool use_br2 = !is_even;
- constexpr static bool aligned = false;
- constexpr static bool prefetch = splitin;
+ constexpr static size_t width = fft_vector_width<T>;
+ constexpr static bool is_even = cometa::is_even(ilog2(size));
+ constexpr static bool use_br2 = !is_even;
+ constexpr static bool aligned = false;
+ constexpr static bool prefetch = splitin;
KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
@@ -510,7 +513,7 @@ protected:
virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override
{
- const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
final_stage(csize<size>, 1, cbool<splitin>, out, in, twiddle);
}
@@ -1064,6 +1067,164 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_
template <typename T>
dft_plan<T>::~dft_plan() = default;
+namespace internal
+{
+
+template <typename T>
+univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
+{
+ const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
+ univector<complex<T>> src1padded = src1;
+ univector<complex<T>> src2padded = src2;
+ src1padded.resize(size, 0);
+ src2padded.resize(size, 0);
+
+ dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+ univector<u8> temp(dft->temp_size);
+ dft->execute(src1padded, src1padded, temp);
+ dft->execute(src2padded, src2padded, temp);
+ src1padded = src1padded * src2padded;
+ dft->execute(src1padded, src1padded, temp, true);
+ const T invsize = reciprocal<T>(size);
+ return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+}
+
+template <typename T>
+univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
+{
+ const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
+ univector<complex<T>> src1padded = src1;
+ univector<complex<T>> src2padded = reverse(src2);
+ src1padded.resize(size, 0);
+ src2padded.resize(size, 0);
+ dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+ univector<u8> temp(dft->temp_size);
+ dft->execute(src1padded, src1padded, temp);
+ dft->execute(src2padded, src2padded, temp);
+ src1padded = src1padded * src2padded;
+ dft->execute(src1padded, src1padded, temp, true);
+ const T invsize = reciprocal<T>(size);
+ return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+}
+
+template <typename T>
+univector<T> autocorrelate(const univector_ref<const T>& src1)
+{
+ return internal::autocorrelate(src1.slice());
+ univector<T> result = correlate(src1, src1);
+ result = result.slice(result.size() / 2);
+ return result;
+}
+
+template univector<float> convolve<float>(const univector_ref<const float>&,
+ const univector_ref<const float>&);
+template univector<double> convolve<double>(const univector_ref<const double>&,
+ const univector_ref<const double>&);
+template univector<float> correlate<float>(const univector_ref<const float>&,
+ const univector_ref<const float>&);
+template univector<double> correlate<double>(const univector_ref<const double>&,
+ const univector_ref<const double>&);
+
+template univector<float> autocorrelate<float>(const univector_ref<const float>&);
+template univector<double> autocorrelate<double>(const univector_ref<const double>&);
+
+} // namespace internal
+
+template <typename T>
+convolve_filter<T>::convolve_filter(size_t size, size_t block_size)
+ : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size),
+ segments((size + block_size - 1) / block_size)
+{
+}
+
+template <typename T>
+convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size)
+ : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)),
+ temp(fft.temp_size),
+ segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
+ ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
+ input_position(0), position(0)
+{
+ set_data(data);
+}
+
+template <typename T>
+void convolve_filter<T>::set_data(const univector<T>& data)
+{
+ univector<T> input(fft.size);
+ const T ifftsize = reciprocal(T(fft.size));
+ for (size_t i = 0; i < ir_segments.size(); i++)
+ {
+ segments[i].resize(block_size);
+ ir_segments[i].resize(block_size, 0);
+ input = padded(data.slice(i * block_size, block_size));
+
+ fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm);
+ process(ir_segments[i], ir_segments[i] * ifftsize);
+ }
+ saved_input.resize(block_size, 0);
+ scratch.resize(block_size * 2);
+ premul.resize(block_size, 0);
+ cscratch.resize(block_size);
+ overlap.resize(block_size, 0);
+}
+
+template <typename T>
+void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
+{
+ size_t processed = 0;
+ while (processed < size)
+ {
+ const size_t processing = std::min(size - processed, block_size - input_position);
+ internal::builtin_memcpy(saved_input.data() + input_position, input + processed,
+ processing * sizeof(T));
+
+ process(scratch, padded(saved_input));
+ fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
+
+ if (input_position == 0)
+ {
+ process(premul, zeros());
+ for (size_t i = 1; i < segments.size(); i++)
+ {
+ const size_t n = (position + i) % segments.size();
+ fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm);
+ }
+ }
+ fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], dft_pack_format::Perm);
+
+ fft.execute(scratch, cscratch, temp, dft_pack_format::Perm);
+
+ process(make_univector(output + processed, processing),
+ scratch.slice(input_position) + overlap.slice(input_position));
+
+ input_position += processing;
+ if (input_position == block_size)
+ {
+ input_position = 0;
+ process(saved_input, zeros());
+
+ internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
+
+ position = position > 0 ? position - 1 : segments.size() - 1;
+ }
+
+ processed += processing;
+ }
+}
+
+template convolve_filter<float>::convolve_filter(size_t, size_t);
+template convolve_filter<double>::convolve_filter(size_t, size_t);
+
+template convolve_filter<float>::convolve_filter(const univector<float>&, size_t);
+template convolve_filter<double>::convolve_filter(const univector<double>&, size_t);
+
+template void convolve_filter<float>::set_data(const univector<float>&);
+template void convolve_filter<double>::set_data(const univector<double>&);
+
+template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
+template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
+
template dft_plan<float>::dft_plan(size_t, cbools_t<false, true>);
template dft_plan<float>::dft_plan(size_t, cbools_t<true, false>);
template dft_plan<float>::dft_plan(size_t, cbools_t<true, true>);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -15,7 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
set(TEST_SRC
)
@@ -38,16 +38,65 @@ endif ()
find_package(MPFR)
+set(ALL_TESTS_CPP
+ all_tests.cpp
+ base_test.cpp
+ complex_test.cpp
+ dft_test.cpp
+ dsp_test.cpp
+ expression_test.cpp
+ intrinsic_test.cpp)
+
+if (MPFR_FOUND)
+ list(APPEND ALL_TESTS_CPP transcendental_test.cpp)
+endif ()
+
+add_executable(all_tests ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
+target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN)
+
add_executable(intrinsic_test intrinsic_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(dft_test dft_test.cpp ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
if (MPFR_FOUND)
+ add_definitions(-DHAVE_MPFR)
include_directories(${MPFR_INCLUDE_DIR})
add_executable(transcendental_test transcendental_test.cpp ${KFR_SRC} ${TEST_SRC})
+ target_link_libraries(all_tests ${MPFR_LIBRARIES})
target_link_libraries(transcendental_test ${MPFR_LIBRARIES})
if (WIN32)
target_link_libraries(transcendental_test gmp) # for MSYS
+ target_link_libraries(all_tests gmp) # for MSYS
endif()
endif ()
+
+function(add_x86_test NAME FLAGS)
+ separate_arguments(FLAGS)
+ add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
+ target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS})
+ target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN)
+ if(MPFR_FOUND)
+ target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES})
+ if (WIN32)
+ target_link_libraries(all_tests_${NAME} gmp) # for MSYS
+ endif()
+ endif()
+endfunction()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR}")
+
+set(ARCH_TESTS 0)
+
+if (CLANG AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|[aA][mM][dD]64)")
+ set(ARCH_TESTS 1)
+ add_x86_test(generic "-march=x86-64 -mno-avx -DCMT_FORCE_GENERIC_CPU")
+ add_x86_test(sse2 "-march=x86-64 -msse2")
+ add_x86_test(sse3 "-march=x86-64 -msse3 -mno-avx")
+ add_x86_test(ssse3 "-march=x86-64 -mssse3 -mno-avx")
+ add_x86_test(sse41 "-march=x86-64 -msse4.1 -mno-avx")
+ add_x86_test(avx "-march=x86-64 -msse4.1 -mavx")
+ add_x86_test(avx2 "-march=x86-64 -msse4.1 -mavx2 -mfma")
+ add_x86_test(avx512 "-march=x86-64 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
+endif()
+
add_executable(dsp_test dsp_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(empty_test empty_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(complex_test complex_test.cpp ${KFR_SRC} ${TEST_SRC})
@@ -56,7 +105,11 @@ add_executable(expression_test expression_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(ebu_test ebu_test.cpp ${KFR_SRC} ${TEST_SRC})
-if (ARM)
+if(USE_SDE)
+ find_program(EMULATOR "sde")
+ list(APPEND EMULATOR "-skx")
+ list(APPEND EMULATOR "--")
+elseif (ARM)
find_program(EMULATOR "qemu-arm")
else ()
set(EMULATOR "")
@@ -86,4 +139,15 @@ if (NOT IOS)
endif ()
add_test(NAME dft_test
COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dft_test)
+
+ if(ARCH_TESTS)
+ add_test(NAME generic COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_generic )
+ add_test(NAME sse2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse2 )
+ add_test(NAME sse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse3 )
+ add_test(NAME ssse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_ssse3 )
+ add_test(NAME sse41 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse41 )
+ add_test(NAME avx COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx )
+ add_test(NAME avx2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx2 )
+ add_test(NAME avx512 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx512 )
+ endif()
endif ()
diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp
@@ -0,0 +1,22 @@
+#include <kfr/io/tostring.hpp>
+#include <kfr/testo/testo.hpp>
+#include <kfr/version.hpp>
+#ifdef HAVE_MPFR
+#include "mpfr/mpfrplus.hpp"
+#endif
+
+using namespace kfr;
+
+int main()
+{
+ println(library_version(), " running on ", cpu_runtime());
+ if (get_cpu() < cpu_t::native)
+ {
+ println("CPU is not supported");
+ return -1;
+ }
+#ifdef HAVE_MPFR
+ mpfr::scoped_precision p(128);
+#endif
+ return testo::run_all("");
+}
diff --git a/tests/base_test.cpp b/tests/base_test.cpp
@@ -377,8 +377,10 @@ TEST(test_stat)
}
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -210,9 +210,11 @@ TEST(static_tests)
testo::assert_is_same<common_type<complex<int>, double>, complex<double>>();
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -14,9 +14,9 @@
using namespace kfr;
#ifdef KFR_NATIVE_F64
-constexpr ctypes_t<float, double> float_types{};
+constexpr ctypes_t<float, double> dft_float_types{};
#else
-constexpr ctypes_t<float> float_types{};
+constexpr ctypes_t<float> dft_float_types{};
#endif
TEST(test_convolve)
@@ -62,7 +62,7 @@ TEST(fft_real)
kfr::univector<float_type, size> in = gen_random_range<float_type>(gen, -1.0, +1.0);
kfr::univector<kfr::complex<float_type>, size / 2 + 1> out = realdft(in);
- kfr::univector<float_type, size> rev = irealdft(out) / size;
+ kfr::univector<float_type, size> rev = irealdft(out) / size;
CHECK(rms(rev - in) <= 0.00001f);
}
@@ -71,7 +71,7 @@ TEST(fft_accuracy)
testo::active_test()->show_progress = true;
random_bit_generator gen(2247448713, 915890490, 864203735, 2982561);
- testo::matrix(named("type") = float_types, //
+ testo::matrix(named("type") = dft_float_types, //
named("inverse") = std::make_tuple(false, true), //
named("log2(size)") = make_range(size_t(1), stopsize), //
[&gen](auto type, bool inverse, size_t log2size) {
@@ -115,16 +115,18 @@ TEST(fft_accuracy)
univector<float_type> out2(size, 0.f);
dft.execute(out2, out, temp);
- out2 = out2 / size;
+ out2 = out2 / size;
const float_type rms_diff_r2 = rms(in - out2);
CHECK(rms_diff_r2 < epsilon * ops);
}
});
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version(), " running on ", cpu_runtime());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp
@@ -306,8 +306,10 @@ TEST(fir)
});
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -146,9 +146,11 @@ TEST(partition)
}
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp
@@ -12,8 +12,8 @@
using namespace kfr;
-constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, //
- i16x1, i16x2, i16x4, i16x8, i16x16, i16x3, //
+constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, //
+ i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, //
i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 //
#ifdef KFR_NATIVE_I64
,
@@ -22,8 +22,8 @@ constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, //
>
signed_types{};
-constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x3, //
- u16x1, u16x2, u16x4, u16x8, u16x16, u16x3, //
+constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, //
+ u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, //
u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 //
#ifdef KFR_NATIVE_I64
,
@@ -127,6 +127,16 @@ inline T ref_satsub(T x, T y)
return result;
}
+TEST(intrin_select)
+{
+ testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) {
+ using Tvec = type_of<decltype(type)>;
+ using T = subtype<Tvec>;
+ CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2));
+ CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1));
+ });
+}
+
TEST(intrin_abs)
{
testo::assert_is_same<decltype(kfr::abs(1)), int>();
@@ -387,4 +397,10 @@ TEST(intrin_math)
CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233)));
}
-int main() { return testo::run_all("", false); }
+#ifndef KFR_NO_MAIN
+int main()
+{
+ println(library_version(), " running on ", cpu_runtime());
+ return testo::run_all("", false);
+}
+#endif
diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp
@@ -14,6 +14,8 @@
using namespace kfr;
+using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>;
+
template <typename T>
double ulps(T test, const mpfr::number& ref)
{
@@ -25,15 +27,72 @@ double ulps(T test, const mpfr::number& ref)
mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL)));
}
+template <typename T, size_t N>
+double ulps(const vec<T, N>& test, const mpfr::number& ref)
+{
+ double u = 0;
+ for (size_t i = 0; i < N; ++i)
+ u = std::max(u, ulps(test[i], ref));
+ return u;
+}
+
TEST(test_sin_cos)
{
- testo::matrix(named("type") = ctypes_t<f32, f64>(),
+ testo::matrix(named("type") = vector_types(),
named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05),
[](auto type, double value) {
using T = type_of<decltype(type)>;
const T x(value);
- CHECK(ulps(kfr::sin(x), mpfr::sin(x)) < 2.0);
- CHECK(ulps(kfr::cos(x), mpfr::cos(x)) < 2.0);
+ CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
+ });
+ testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
+ });
+}
+
+TEST(test_tan)
+{
+ testo::matrix(named("type") = ctypes_t<f32>(),
+ named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0);
+ });
+ testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0);
+ });
+}
+
+TEST(test_asin_acos_atan)
+{
+ testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::acos(x), mpfr::acos(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < 2.0);
+ });
+}
+
+TEST(test_atan2)
+{
+ testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1),
+ named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) {
+ using T = type_of<decltype(type)>;
+ const T x(value1);
+ const T y(value2);
+ CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) <
+ 2.0);
});
}
@@ -97,8 +156,11 @@ TEST(test_exp10)
});
}
+#ifndef KFR_NO_MAIN
int main()
{
+ println(library_version(), " running on ", cpu_runtime());
mpfr::scoped_precision p(128);
return testo::run_all("");
}
+#endif