New conversion functions and refactoring - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 73c5446bd8040f030b87da4392e41643e54b6eeb
parent 8de4b0fe7e8c96548de8dd62c698399fdd58f8e6
Author: samuriddle@gmail.com <samuriddle@gmail.com>
Date:   Sun,  7 Aug 2016 07:09:20 +0300

New conversion functions and refactoring

Diffstat:
M include/kfr/base/asin_acos.hpp  | 4 ++--
M include/kfr/base/complex.hpp  | 66 +++++++++++++++++++++++++++++++++++-------------------------------
M include/kfr/base/expression.hpp  | 15 +++++++--------
M include/kfr/base/generators.hpp  | 10 +++++-----
M include/kfr/base/log_exp.hpp  | 2 +-
M include/kfr/base/operators.hpp  | 115 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M include/kfr/base/pointer.hpp  | 2 +-
M include/kfr/base/random.hpp  | 8 ++++----
M include/kfr/base/round.hpp  | 20 ++++++++++++++++++++
M include/kfr/base/select.hpp  | 7 +++----
M include/kfr/base/types.hpp  | 18 ++++++++++++++----
M include/kfr/base/univector.hpp  | 6 +++---
M include/kfr/base/vec.hpp  | 256 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M include/kfr/cometa.hpp  | 20 ++++++++++++--------
M include/kfr/dft/ft.hpp  | 3 +++
M include/kfr/dsp/biquad.hpp  | 135 +++++++++++++++++++++++++++++++++++++------------------------------------------
M include/kfr/dsp/fir.hpp  | 8 ++++++--
M include/kfr/dsp/goertzel.hpp  | 6 +++---
M include/kfr/dsp/units.hpp  | 2 +-
M include/kfr/dsp/window.hpp  | 26 +++++++++++++-------------
M include/kfr/io/file.hpp  | 4 ++--
M tests/complex_test.cpp  | 6 +++---

22 files changed, 417 insertions(+), 322 deletions(-)
diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp
@@ -36,14 +36,14 @@ namespace intrinsics
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x)
 {
-    const vec<Tout, N> xx = cast<Tout>(x);
+    const vec<Tout, N> xx = x;
     return atan2(xx, sqrt(Tout(1) - xx * xx));
 }
 
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x)
 {
-    const vec<Tout, N> xx = cast<Tout>(x);
+    const vec<Tout, N> xx = x;
     return atan2(sqrt(Tout(1) - xx * xx), xx);
 }
 KFR_I_CONVERTER(asin)
diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp
@@ -100,10 +100,12 @@ namespace cometa
 template <typename T>
 struct compound_type_traits<kfr::complex<T>>
 {
-    constexpr static size_t width   = 2;
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static bool is_scalar = false;
+    constexpr static size_t width      = 2;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
     template <typename U>
     using rebind = kfr::complex<U>;
     template <typename U>
@@ -157,21 +159,21 @@ struct vec_op<complex<T>> : private vec_op<T>
 template <typename T, size_t N>
 KFR_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x)
 {
-    return subcast<complex<T>>(dupeven(subcast<T>(x)));
+    return compcast<complex<T>>(dupeven(compcast<T>(x)));
 }
 KFR_FN(cdupreal)
 
 template <typename T, size_t N>
 KFR_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x)
 {
-    return subcast<complex<T>>(dupodd(subcast<T>(x)));
+    return compcast<complex<T>>(dupodd(compcast<T>(x)));
 }
 KFR_FN(cdupimag)
 
 template <typename T, size_t N>
 KFR_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x)
 {
-    return subcast<complex<T>>(swap<2>(subcast<T>(x)));
+    return compcast<complex<T>>(swap<2>(compcast<T>(x)));
 }
 KFR_FN(cswapreim)
 
@@ -205,41 +207,43 @@ template <typename T>
 struct is_complex_impl<complex<T>> : std::true_type
 {
 };
-}
-
-// real to complex
-template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept
-{
-    const vec<subtype<To>, N> casted = cast<subtype<To>>(value);
-    return subcast<To>(interleave(casted, zerovector(casted)));
-}
 
-// complex to complex
-template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept
+// vector<complex> to vector<complex>
+template <typename To, typename From, size_t N>
+struct conversion<vec<complex<To>, N>, vec<complex<From>, N>>
 {
-    return subcast<To>(cast<subtype<To>>(subcast<From>(value)));
-}
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<complex<To>, N> cast(const vec<complex<From>, N>& value)
+    {
+        return builtin_convertvector<complex<To>>(value);
+    }
+};
 
-// complex to real
-template <typename To, typename From, size_t N, KFR_ENABLE_IF(!internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept
+// vector to vector<complex>
+template <typename To, typename From, size_t N>
+struct conversion<vec<complex<To>, N>, vec<From, N>>
 {
-    static_assert(sizeof(To) == 0, "Can't cast complex to real");
-    return {};
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<complex<To>, N> cast(const vec<From, N>& value)
+    {
+        const vec<To, N> casted = static_cast<vec<To, N>>(value);
+        return *interleave(casted, zerovector(casted));
+    }
+};
 }
 
 template <typename T, size_t N>
 constexpr KFR_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x)
 {
-    return subcast<complex<T>>(x);
+    return compcast<complex<T>>(x);
 }
 
 template <typename T, size_t N>
 constexpr KFR_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x)
 {
-    return subcast<T>(x);
+    return compcast<T>(x);
 }
 
 template <typename T>
@@ -250,7 +254,7 @@ constexpr KFR_INLINE T real(const complex<T>& value)
 template <typename T, size_t N>
 constexpr KFR_INLINE vec<T, N> real(const vec<complex<T>, N>& value)
 {
-    return even(subcast<T>(value));
+    return even(compcast<T>(value));
 }
 
 template <typename T>
@@ -273,7 +277,7 @@ constexpr KFR_INLINE T imag(const complex<T>& value)
 template <typename T, size_t N>
 constexpr KFR_INLINE vec<T, N> imag(const vec<complex<T>, N>& value)
 {
-    return odd(subcast<T>(value));
+    return odd(compcast<T>(value));
 }
 KFR_FN(imag)
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
@@ -285,7 +289,7 @@ KFR_INLINE internal::expression_function<fn_imag, E1> imag(E1&& x)
 template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>>
 constexpr KFR_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0))
 {
-    return subcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag)));
+    return compcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag)));
 }
 
 template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>>
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -99,8 +99,7 @@ protected:
 
 private:
     template <typename Arg, size_t N, typename Tin,
-              typename Tout1 = conditional<is_generic<Arg>::value, Tin, typename decay<Arg>::value_type>,
-              typename Tout  = Tout1>
+              typename Tout = conditional<is_generic<Arg>::value, Tin, value_type_of<Arg>>>
     KFR_INLINE vec_t<Tout, N> vec_t_for() const
     {
         return {};
@@ -112,8 +111,8 @@ private:
         constexpr size_t Nin = N * ratio::input / ratio::output;
         using Tout = conditional<is_same<generic, value_type>::value, T, common_type<T, value_type>>;
 
-        return cast<T>(fn(cast<Tout>(std::get<indices>(this->args)(
-            cinput, index * ratio::input / ratio::output, vec_t_for<Args, Nin, Tout>()))...));
+        return fn(std::get<indices>(this->args)(cinput, index * ratio::input / ratio::output,
+                                                vec_t_for<Args, Nin, Tout>())...);
     }
     template <size_t... indices>
     KFR_INLINE void begin_block_impl(size_t size, csizes_t<indices...>)
@@ -149,7 +148,7 @@ struct expression_scalar : input_expression
     template <typename U, size_t N>
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const
     {
-        return resize<N>(cast<U>(val));
+        return resize<N>(static_cast<vec<U, width>>(val));
     }
 };
 
@@ -204,7 +203,7 @@ KFR_INLINE void process_cycle(OutFn&& outfn, const Fn& fn, size_t& i, size_t siz
     KFR_LOOP_NOUNROLL
     for (; i < count; i += width)
     {
-        outfn(coutput, i, cast<Tout>(fn(cinput, i, vec_t<Tin, width>())));
+        outfn(coutput, i, fn(cinput, i, vec_t<Tin, width>()));
     }
 }
 }
@@ -269,7 +268,7 @@ struct expressoin_typed : input_expression
     template <typename U, size_t N>
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
-        return cast<U>(e1(cinput, index, vec_t<T, N>()));
+        return e1(cinput, index, vec_t<T, N>());
     }
     E1 e1;
 };
@@ -286,7 +285,7 @@ struct expressoin_sized : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         auto val = e1(cinput, index, vec_t<T, N>());
-        return cast<U>(val);
+        return val;
     }
 
     constexpr size_t size() const noexcept { return m_size; }
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -43,7 +43,7 @@ struct generator : input_expression
     template <typename U, size_t N>
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N> t) const
     {
-        return cast<U>(generate(t));
+        return generate(t);
     }
 
     void resync(T start) const { ptr_cast<Class>(this)->sync(start); }
@@ -108,7 +108,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)>
+template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
 struct generator_exp : generator<T, width, generator_exp<T, width>>
 {
     generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step* width))[0] - 1)
@@ -125,7 +125,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)>
+template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
 struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
 {
     generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
@@ -142,7 +142,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)>
+template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
 struct generator_cossin : generator<T, width, generator_cossin<T, width>>
 {
     generator_cossin(T start, T step)
@@ -167,7 +167,7 @@ protected:
     }
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4)>
+template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4), KFR_ARCH_DEP>
 struct generator_sin : generator<T, width, generator_sin<T, width>>
 {
     generator_sin(T start, T step)
diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp
@@ -81,7 +81,7 @@ KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
 template <typename T, size_t N>
 KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x)
 {
-    return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1));
+    return select(x == T(), -c_infinity<T>, static_cast<vec<T, N>>(vilogbp1(x) - 1));
 }
 
 template <size_t N>
diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp
@@ -81,9 +81,7 @@ KFR_INLINE internal::expression_function<fn_add, E1, E2> add(E1&& x, E2&& y)
 template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
 KFR_INLINE internal::expression_function<fn_add, E1> add(E1&& x, E2&& y, E3&& z)
 {
-    return { fn_add(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z)
-
-    };
+    return { fn_add(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
 }
 
 template <typename T1, typename T2>
@@ -101,9 +99,7 @@ KFR_FN(sub)
 template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
 KFR_INLINE internal::expression_function<fn_sub, E1, E2> sub(E1&& x, E2&& y)
 {
-    return { fn_sub(), std::forward<E1>(x), std::forward<E2>(y)
-
-    };
+    return { fn_sub(), std::forward<E1>(x), std::forward<E2>(y) };
 }
 
 template <typename T1>
@@ -111,10 +107,10 @@ constexpr inline T1 mul(T1 x)
 {
     return x;
 }
-template <typename T1, typename T2, typename... Ts>
-constexpr inline common_type<T1, T2, Ts...> mul(T1 x, T2 y, Ts... rest)
+template <typename T1, typename T2, typename... Ts, typename Tout = common_type<T1, T2, Ts...>>
+constexpr inline Tout mul(T1 x, T2 y, Ts... rest)
 {
-    return x * mul(std::forward<T2>(y), std::forward<Ts>(rest)...);
+    return static_cast<Tout>(x) * static_cast<Tout>(mul(std::forward<T2>(y), std::forward<Ts>(rest)...));
 }
 
 template <typename T>
@@ -156,9 +152,7 @@ KFR_FN(cub)
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_INLINE internal::expression_function<fn_cub, E1> cub(E1&& x)
 {
-    return { fn_cub(), std::forward<E1>(x)
-
-    };
+    return { fn_cub(), std::forward<E1>(x) };
 }
 
 template <typename T>
@@ -192,30 +186,22 @@ KFR_FN(pow5)
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_INLINE internal::expression_function<fn_pow2, E1> pow2(E1&& x)
 {
-    return { fn_pow2(), std::forward<E1>(x)
-
-    };
+    return { fn_pow2(), std::forward<E1>(x) };
 }
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_INLINE internal::expression_function<fn_pow3, E1> pow3(E1&& x)
 {
-    return { fn_pow3(), std::forward<E1>(x)
-
-    };
+    return { fn_pow3(), std::forward<E1>(x) };
 }
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_INLINE internal::expression_function<fn_pow4, E1> pow4(E1&& x)
 {
-    return { fn_pow4(), std::forward<E1>(x)
-
-    };
+    return { fn_pow4(), std::forward<E1>(x) };
 }
 template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
 KFR_INLINE internal::expression_function<fn_pow5, E1> pow5(E1&& x)
 {
-    return { fn_pow5(), std::forward<E1>(x)
-
-    };
+    return { fn_pow5(), std::forward<E1>(x) };
 }
 
 /// Raise x to the power base $x^{base}$
@@ -265,24 +251,24 @@ KFR_FN(sqrsum)
 KFR_FN(sqrdiff)
 
 /// Division
-template <typename T1, typename T2>
-inline common_type<T1, T2> div(T1 x, T2 y)
+template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
+inline Tout div(const T1& x, const T2& y)
 {
-    return x / y;
+    return static_cast<Tout>(x) / static_cast<Tout>(y);
 }
 KFR_FN(div)
 
 /// Remainder
-template <typename T1, typename T2>
-inline common_type<T1, T2> rem(T1 x, T2 y)
+template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
+inline Tout rem(const T1& x, const T2& y)
 {
-    return x % y;
+    return static_cast<Tout>(x) % static_cast<Tout>(y);
 }
 KFR_FN(rem)
 
 /// Negation
 template <typename T1>
-inline T1 neg(T1 x)
+inline T1 neg(const T1& x)
 {
     return -x;
 }
@@ -290,7 +276,7 @@ KFR_FN(neg)
 
 /// Bitwise Not
 template <typename T1>
-inline T1 bitwisenot(T1 x)
+inline T1 bitwisenot(const T1& x)
 {
     return ~x;
 }
@@ -499,26 +485,6 @@ constexpr KFR_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y)
     return (x & internal::highbitmask<T>) | (y & internal::highbitmask<T>);
 }
 
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y)
-{
-    return x - cast<itype<T>>(x / y) * y;
-}
-
-KFR_FN_S(fmod)
-KFR_FN(fmod)
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-constexpr KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
-{
-    return x % y;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
-{
-    return fmod(x, y);
-}
-
 template <typename T, size_t N>
 KFR_INLINE mask<T, N> isnan(const vec<T, N>& x)
 {
@@ -695,4 +661,49 @@ KFR_EXPR_BINARY(fn_less, <)
 KFR_EXPR_BINARY(fn_greater, >)
 KFR_EXPR_BINARY(fn_lessorequal, <=)
 KFR_EXPR_BINARY(fn_greaterorequal, >=)
+
+template <typename T, size_t N1, size_t... Ns>
+vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest)
+{
+    const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...));
+    return compcast<vec<T, sizeof...(Ns) + 1>>(t);
+}
+
+KFR_FN(packtranspose)
+
+namespace internal
+{
+template <typename... E>
+struct expression_pack : expression<E...>, output_expression
+{
+    expression_pack(E&&... e) : expression<E...>(std::forward<E>(e)...) {}
+    using value_type = vec<common_type<value_type_of<E>...>, sizeof...(E)>;
+    using size_type  = typename expression<E...>::size_type;
+    constexpr size_type size() const noexcept { return expression<E...>::size(); }
+
+    template <typename U, size_t N>
+    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
+    {
+        return this->call(fn_packtranspose(), index, x);
+    }
+    template <typename U, size_t N>
+    KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
+    {
+        output(index, x, csizeseq<sizeof...(E)>);
+    }
+
+private:
+    template <typename U, size_t N, size_t... indices>
+    void output(size_t index, const vec<U, N>& x, csizes_t<indices...>)
+    {
+        swallow{ (std::get<indices>(this->args)(coutput, index, x[indices]), void(), 0)... };
+    }
+};
+}
+
+template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
+internal::expression_pack<internal::arg<E>...> pack(E&&... e)
+{
+    return internal::expression_pack<internal::arg<E>...>(std::forward<E>(e)...);
+}
 }
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -82,7 +82,7 @@ struct expression_pointer : input_expression
         constexpr size_t findex = ilog2(N);
         static_assert(N <= maxwidth, "N is greater than maxwidth");
         func_t func = reinterpret_cast<func_t>(vtable->get(csize<2 + findex>));
-        vec<U, N> result = cast<U>(func(instance, index));
+        vec<U, N> result = func(instance, index);
         return result;
     }
     KFR_INLINE void begin_block(size_t size) const
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -114,8 +114,8 @@ inline enable_if_not_f<vec<T, N>> random_range(random_bit_generator& gen, T min,
     using big_type = findinttype<sqr(std::numeric_limits<T>::min()), sqr(std::numeric_limits<T>::max())>;
 
     vec<T, N> u                = random_uniform<T, N>(gen);
-    const vec<big_type, N> tmp = cast<big_type>(u);
-    return cast<T>((tmp * (max - min) + min) >> typebits<T>::bits);
+    const vec<big_type, N> tmp = u;
+    return (tmp * (max - min) + min) >> typebits<T>::bits;
 }
 
 namespace internal
@@ -128,7 +128,7 @@ struct expression_random_uniform : input_expression
     template <typename U, size_t N>
     vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const
     {
-        return cast<U>(random_uniform<T, N>(gen));
+        return random_uniform<T, N>(gen);
     }
     mutable random_bit_generator gen;
 };
@@ -146,7 +146,7 @@ struct expression_random_range : input_expression
     template <typename U, size_t N>
     vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const
     {
-        return cast<U>(random_range<N, T>(gen, min, max));
+        return random_range<N, T>(gen, min, max);
     }
     mutable random_bit_generator gen;
     const T min;
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -318,6 +318,26 @@ KFR_INTRIN internal::expression_function<fn::itrunc, E1> itrunc(E1&& x)
 {
     return { fn::itrunc(), std::forward<E1>(x) };
 }
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return x - trunc(x / y) * y;
+}
+
+KFR_FN_S(fmod)
+KFR_FN(fmod)
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+constexpr KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return x % y;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return fmod(x, y);
+}
 }
 
 #undef KFR_mm_trunc_ps
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -180,9 +180,9 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<
 
 // fallback
 template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> select(mask<T, N> m, const vec<T, N>& x, const vec<T, N>& y)
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
 {
-    return y ^ ((x ^ y) & m);
+    return y ^ ((x ^ y) & m.asvec());
 }
 #endif
 }
@@ -193,8 +193,7 @@ template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_nume
 KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
 {
     static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types");
-    return intrinsics::select(bitcast<Tout>(m).asmask(), static_cast<vec<Tout, N>>(x),
-                              static_cast<vec<Tout, N>>(y));
+    return intrinsics::select(bitcast<Tout>(m), static_cast<vec<Tout, N>>(x), static_cast<vec<Tout, N>>(y));
 }
 
 template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -349,6 +349,8 @@ enum class cpu_t : int
     runtime = -1,
 };
 
+#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native
+
 template <cpu_t cpu>
 using ccpu_t = cval_t<cpu_t, cpu>;
 
@@ -578,6 +580,12 @@ constexpr inline static const T* derived_cast(const U* ptr)
     return static_cast<const T*>(ptr);
 }
 
+template <typename T, typename U>
+constexpr inline static T implicit_cast(U&& value)
+{
+    return std::forward<T>(value);
+}
+
 #pragma clang diagnostic pop
 
 __attribute__((unused)) static const char* cpu_name(cpu_t set)
@@ -781,10 +789,12 @@ namespace cometa
 template <typename T, size_t N>
 struct compound_type_traits<kfr::vec_t<T, N>>
 {
-    constexpr static size_t width   = N;
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static bool is_scalar = false;
+    constexpr static size_t width      = N;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
 
     template <typename U>
     using rebind = kfr::vec_t<U, N>;
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -43,13 +43,13 @@ struct univector_base : input_expression, output_expression
     KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
     {
         T* data = derived_cast<Class>(this)->data();
-        write(ptr_cast<T>(data) + index, cast<T>(value));
+        write(ptr_cast<T>(data) + index, vec<T, N>(value));
     }
     template <typename U, size_t N>
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         const T* data = derived_cast<Class>(this)->data();
-        return cast<U>(read<N>(ptr_cast<T>(data) + index));
+        return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index));
     }
 
     template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
@@ -197,7 +197,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T
     constexpr static bool is_array_ref = true;
     constexpr static bool is_vector    = false;
     constexpr static bool is_aligned   = false;
-    using value_type                   = T;
+    using value_type                   = remove_const<T>;
 
     using univector_base<T, univector>::operator=;
 };
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -99,7 +99,7 @@ struct vec_ptr
 template <typename To, typename From, size_t N,
           KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value),
           size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width>
-constexpr KFR_INLINE vec<To, Nout> subcast(const vec<From, N>& value) noexcept
+constexpr KFR_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept
 {
     return *value;
 }
@@ -154,8 +154,8 @@ template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(is_compound<T>:
 KFR_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...> indices, const vec<T, N>& x,
                                                     const vec<T, N>& y)
 {
-    return subcast<T>(
-        shufflevector(inflate(csize<widthof<T>()>, indices), subcast<subtype<T>>(x), subcast<subtype<T>>(y)));
+    return compcast<T>(shufflevector(inflate(csize<widthof<T>()>, indices), compcast<subtype<T>>(x),
+                                     compcast<subtype<T>>(y)));
 }
 
 template <size_t... Indices, size_t Nout = sizeof...(Indices), typename T, size_t N>
@@ -225,14 +225,90 @@ constexpr swiz<14> s14{};
 constexpr swiz<15> s15{};
 }
 
-template <typename To, typename From, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr KFR_INLINE To cast(From value) noexcept
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+
+template <size_t N, typename T>
+constexpr KFR_INLINE vec<T, N> broadcast(T x)
+{
+    return (simd<T, N>)(x);
+}
+
+#pragma clang diagnostic pop
+
+namespace internal
+{
+
+template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>,
+          size_t Nout = N* compound_type_traits<To>::deep_width>
+constexpr KFR_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept
 {
-    return static_cast<To>(value);
+    return __builtin_convertvector(*value, simd<Tsub, Nout>);
 }
-template <typename To, typename From, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr KFR_INLINE To bitcast(From value) noexcept
+
+// scalar to scalar
+template <typename To, typename From>
+struct conversion
 {
+    static_assert(std::is_convertible<From, To>::value, "");
+    static To cast(const From& value) { return value; }
+};
+
+// vector to vector
+template <typename To, typename From, size_t N>
+struct conversion<vec<To, N>, vec<From, N>>
+{
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<To, N> cast(const vec<From, N>& value) { return builtin_convertvector<To>(value); }
+};
+
+// vector<vector> to vector<vector>
+template <typename To, typename From, size_t N1, size_t N2>
+struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, N1>, N2>>
+{
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value)
+    {
+        return builtin_convertvector<vec<To, N1>>(value);
+    }
+};
+
+// scalar to vector
+template <typename To, typename From, size_t N>
+struct conversion<vec<To, N>, From>
+{
+    static_assert(std::is_convertible<From, To>::value, "");
+    static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); }
+};
+
+// mask to mask
+template <typename To, typename From, size_t N>
+struct conversion<mask<To, N>, mask<From, N>>
+{
+    static_assert(sizeof(To) == sizeof(From), "");
+    static mask<To, N> cast(const mask<From, N>& value) { return reinterpret_cast<simd<To, N>>(*value); }
+};
+}
+
+template <typename From, size_t N, typename Tsub = deep_subtype<From>,
+          size_t Nout = N * sizeof(From) / sizeof(Tsub)>
+constexpr KFR_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& value) noexcept
+{
+    return *value;
+}
+
+template <typename To, typename From, typename Tout = deep_rebind<From, To>>
+constexpr KFR_INLINE Tout cast(const From& value) noexcept
+{
+    return static_cast<Tout>(value);
+}
+
+template <typename To, typename From>
+constexpr KFR_INLINE To bitcast(const From& value) noexcept
+{
+    static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types");
     union {
         From from;
         To to;
@@ -240,43 +316,34 @@ constexpr KFR_INLINE To bitcast(From value) noexcept
     return u.to;
 }
 
-template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr KFR_INLINE To ubitcast(From value) noexcept
+template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
+constexpr KFR_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
 {
-    return bitcast<To>(value);
+    return reinterpret_cast<typename vec<To, Nout>::simd_t>(*value);
 }
 
-template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr KFR_INLINE To ibitcast(From value) noexcept
+template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
+constexpr KFR_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept
 {
-    return bitcast<To>(value);
+    return reinterpret_cast<typename mask<To, Nout>::simd_t>(*value);
 }
 
-template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr KFR_INLINE To fbitcast(From value) noexcept
+template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INLINE To ubitcast(const From& value) noexcept
 {
     return bitcast<To>(value);
 }
 
-template <typename To, typename From, size_t N, KFR_ENABLE_IF(!is_compound<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept
-{
-    return __builtin_convertvector(*value, simd<To, N>);
-}
-template <typename To, typename From, simdindex N>
-constexpr KFR_INLINE simd<To, N> cast(const simd<From, N>& value) noexcept
-{
-    return __builtin_convertvector(value, simd<To, N>);
-}
-template <typename To, typename From, size_t N, size_t Nout = sizeof(From) * N / sizeof(To)>
-constexpr KFR_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
+template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INLINE To ibitcast(const From& value) noexcept
 {
-    return reinterpret_cast<simd<To, Nout>>(*value);
+    return bitcast<To>(value);
 }
-template <typename To, typename From, simdindex N, simdindex Nout = sizeof(From) * N / sizeof(To)>
-constexpr KFR_INLINE simd<To, Nout> bitcast(const simd<From, N>& value) noexcept
+
+template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INLINE To fbitcast(const From& value) noexcept
 {
-    return reinterpret_cast<simd<To, Nout>>(value);
+    return bitcast<To>(value);
 }
 
 template <typename From, size_t N, typename To = utype<From>, size_t Nout = sizeof(From) * N / sizeof(To)>
@@ -297,27 +364,6 @@ constexpr KFR_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
     return reinterpret_cast<simd<To, Nout>>(*value);
 }
 
-template <typename From, simdindex N, typename To = utype<From>,
-          simdindex Nout = sizeof(From) * N / sizeof(To)>
-constexpr KFR_INLINE simd<To, Nout> ubitcast(const simd<From, N>& value) noexcept
-{
-    return reinterpret_cast<simd<To, Nout>>(value);
-}
-
-template <typename From, simdindex N, typename To = itype<From>,
-          simdindex Nout = sizeof(From) * N / sizeof(To)>
-constexpr KFR_INLINE simd<To, Nout> ibitcast(const simd<From, N>& value) noexcept
-{
-    return reinterpret_cast<simd<To, Nout>>(value);
-}
-
-template <typename From, simdindex N, typename To = ftype<From>,
-          simdindex Nout = sizeof(From) * N / sizeof(To)>
-constexpr KFR_INLINE simd<To, Nout> fbitcast(const simd<From, N>& value) noexcept
-{
-    return reinterpret_cast<simd<To, Nout>>(value);
-}
-
 constexpr KFR_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
 
 template <typename T, size_t N, size_t... Sizes, size_t Nout = N + csum(csizes<Sizes...>)>
@@ -345,17 +391,6 @@ KFR_INLINE vec<T, Nout> repeat(const vec<T, N>& x)
 }
 KFR_FN(repeat)
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
-
-template <size_t N, typename T>
-constexpr KFR_INLINE vec<T, N> broadcast(T x)
-{
-    return (simd<T, N>)(x);
-}
-
-#pragma clang diagnostic pop
-
 template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)>
 KFR_INLINE vec<T, Nout> resize(const vec<T, N>& x)
 {
@@ -422,7 +457,8 @@ private:
 template <typename T>
 struct vec_op
 {
-    using scalar_type = subtype<T>;
+    using scalar_type  = subtype<T>;
+    using uscalar_type = utype<scalar_type>;
 
     template <simdindex N>
     constexpr static simd<scalar_type, N> add(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
@@ -467,53 +503,56 @@ struct vec_op
     template <simdindex N>
     constexpr static simd<scalar_type, N> band(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(ubitcast(x) & ubitcast(y));
+        return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) &
+                                                      reinterpret_cast<simd<uscalar_type, N>>(y));
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> bor(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(ubitcast(x) | ubitcast(y));
+        return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) |
+                                                      reinterpret_cast<simd<uscalar_type, N>>(y));
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> bxor(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(ubitcast(x) ^ ubitcast(y));
+        return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) ^
+                                                      reinterpret_cast<simd<uscalar_type, N>>(y));
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> bnot(simd<scalar_type, N> x) noexcept
     {
-        return bitcast<scalar_type>(~ubitcast(x));
+        return reinterpret_cast<simd<scalar_type, N>>(~reinterpret_cast<simd<uscalar_type, N>>(x));
     }
 
     template <simdindex N>
     constexpr static simd<scalar_type, N> eq(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x == y);
+        return reinterpret_cast<simd<scalar_type, N>>(x == y);
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> ne(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x != y);
+        return reinterpret_cast<simd<scalar_type, N>>(x != y);
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> lt(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x < y);
+        return reinterpret_cast<simd<scalar_type, N>>(x < y);
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> gt(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x > y);
+        return reinterpret_cast<simd<scalar_type, N>>(x > y);
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> le(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x <= y);
+        return reinterpret_cast<simd<scalar_type, N>>(x <= y);
     }
     template <simdindex N>
     constexpr static simd<scalar_type, N> ge(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept
     {
-        return bitcast<scalar_type>(x >= y);
+        return reinterpret_cast<simd<scalar_type, N>>(x >= y);
     }
 };
 
@@ -554,7 +593,8 @@ constexpr KFR_INLINE vec<T, N> make_vector(cvals_t<T, Values...>)
 KFR_FN(make_vector)
 
 template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1),
-          typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>
+          typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>,
+          KFR_ENABLE_IF(is_numeric<SubType>::value)>
 constexpr KFR_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest)
 {
     return internal::make_vector_impl<SubType>(csizeseq<N * widthof<SubType>()>, static_cast<SubType>(x),
@@ -567,6 +607,7 @@ struct vec : vec_t<T, N>
 {
     static_assert(N > 0 && N <= 256, "Invalid vector size");
 
+    using UT          = utype<T>;
     using value_type  = T;
     using scalar_type = subtype<T>;
     constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; }
@@ -582,6 +623,10 @@ struct vec : vec_t<T, N>
         : v(*internal_read_write::read<N, false>(value.data()))
     {
     }
+    constexpr KFR_INLINE vec(const array_ref<const T>& value) noexcept
+        : v(*internal_read_write::read<N, false>(value.data()))
+    {
+    }
     template <typename U,
               KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width > 1)>
     constexpr KFR_INLINE vec(const U& value) noexcept
@@ -690,10 +735,10 @@ struct vec : vec_t<T, N>
     using array_t = T (&)[N];
     KFR_INLINE array_t arr() { return ref_cast<array_t>(v); }
 
-    template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value)>
+    template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)>
     constexpr operator vec<U, N>() const noexcept
     {
-        return cast<U>(*this);
+        return internal::conversion<vec<U, N>, vec<T, N>>::cast(*this);
     }
 
 private:
@@ -730,6 +775,7 @@ private:
 template <typename T, size_t N>
 struct mask : public vec<T, N>
 {
+    using UT                      = utype<T>;
     using type                    = T;
     constexpr static size_t width = N;
 
@@ -758,23 +804,19 @@ struct mask : public vec<T, N>
     {
     }
 
-    //    template <typename M, typename = u8[sizeof(T) == sizeof(M)]>
-    //    constexpr KFR_INLINE mask(mask<M, N> value) : base(reinterpret_cast<const vec<T, N>&>(value))
-    //    {
-    //    }
-    constexpr KFR_INLINE mask operator~() const { return bitcast<T>(~ubitcast(this->v)); }
-    constexpr KFR_INLINE mask operator&(const vec<T, N>& x) const
+    friend constexpr KFR_INLINE mask operator&(const mask& x, const mask& y)
     {
-        return bitcast<T>(ubitcast(this->v) & ubitcast(x.v));
+        return vec_op<T>::band(x.v, y.v);
     }
-    constexpr KFR_INLINE mask operator|(const vec<T, N>& x) const
+    friend constexpr KFR_INLINE mask operator|(const mask& x, const mask& y)
     {
-        return bitcast<T>(ubitcast(this->v) | ubitcast(x.v));
+        return vec_op<T>::bor(x.v, y.v);
     }
-    constexpr KFR_INLINE mask operator^(const vec<T, N>& x) const
+    friend constexpr KFR_INLINE mask operator^(const mask& x, const mask& y)
     {
-        return bitcast<T>(ubitcast(this->v) ^ ubitcast(x.v));
+        return vec_op<T>::bxor(x.v, y.v);
     }
+    friend constexpr KFR_INLINE mask operator~(const mask& x) { return vec_op<T>::bnot(x.v); }
 
     constexpr KFR_INLINE mask operator&&(const mask& x) const { return *this & x; }
     constexpr KFR_INLINE mask operator||(const mask& x) const { return *this | x; }
@@ -794,8 +836,8 @@ struct mask : public vec<T, N>
     KFR_INLINE bool operator[](size_t index) const { return ibitcast(this->v[index]) < 0; }
 };
 
-template <typename T, size_t N>
-using cvec = vec<T, N * 2>;
+template <typename T, size_t N1, size_t N2 = N1>
+using mat = vec<vec<T, N1>, N2>;
 
 namespace internal
 {
@@ -1171,7 +1213,7 @@ template <typename T, size_t N>
 constexpr KFR_INLINE vec<T, N> zerovector()
 {
     constexpr size_t width = N * compound_type_traits<T>::width;
-    return subcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>()));
+    return compcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>()));
 }
 
 template <typename T, size_t N>
@@ -1285,10 +1327,12 @@ namespace cometa
 template <typename T, size_t N>
 struct compound_type_traits<kfr::simd<T, N>>
 {
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static size_t width   = N;
-    constexpr static bool is_scalar = false;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static size_t width      = N;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
     template <typename U>
     using rebind = kfr::simd<U, N>;
     template <typename U>
@@ -1300,10 +1344,12 @@ struct compound_type_traits<kfr::simd<T, N>>
 template <typename T, size_t N>
 struct compound_type_traits<kfr::vec<T, N>>
 {
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static size_t width   = N;
-    constexpr static bool is_scalar = false;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static size_t width      = N;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
     template <typename U>
     using rebind = kfr::vec<U, N>;
     template <typename U>
@@ -1315,10 +1361,12 @@ struct compound_type_traits<kfr::vec<T, N>>
 template <typename T, size_t N>
 struct compound_type_traits<kfr::mask<T, N>>
 {
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static size_t width   = N;
-    constexpr static bool is_scalar = false;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static size_t width      = N;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
     template <typename U>
     using rebind = kfr::mask<U, N>;
     template <typename U>
diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp
@@ -135,10 +135,12 @@ constexpr size_t typeindex()
 template <typename T>
 struct compound_type_traits
 {
-    constexpr static size_t width   = 1;
-    using subtype                   = T;
-    using deep_subtype              = T;
-    constexpr static bool is_scalar = true;
+    constexpr static size_t width      = 1;
+    constexpr static size_t deep_width = width;
+    using subtype                      = T;
+    using deep_subtype                 = T;
+    constexpr static size_t depth      = 0;
+    constexpr static bool is_scalar    = true;
 
     template <typename U>
     using rebind = U;
@@ -166,10 +168,12 @@ using deep_rebind = typename compound_type_traits<T>::template deep_rebind<SubTy
 template <typename T>
 struct compound_type_traits<std::pair<T, T>>
 {
-    constexpr static size_t width   = 2;
-    using subtype                   = T;
-    using deep_subtype              = cometa::deep_subtype<T>;
-    constexpr static bool is_scalar = false;
+    constexpr static size_t width      = 2;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    using subtype                      = T;
+    using deep_subtype                 = cometa::deep_subtype<T>;
+    constexpr static bool is_scalar    = false;
+    constexpr static size_t depth      = cometa::compound_type_traits<T>::depth + 1;
 
     template <typename U>
     using rebind = std::pair<U, U>;
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -103,6 +103,9 @@ KFR_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y)
 KFR_FN(cmul_conj)
 KFR_FN(cmul_2conj)
 
+template <typename T, size_t N>
+using cvec = vec<T, N * 2>;
+
 template <size_t N, bool A = false, typename T>
 KFR_INLINE cvec<T, N> cread(const complex<T>* src)
 {
diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp
@@ -74,99 +74,92 @@ struct biquad_params
 
 namespace internal
 {
-template <cpu_t cpu = cpu_t::native>
-struct in_biquad
+template <typename T, size_t filters, KFR_ARCH_DEP>
+struct biquad_block
 {
-private:
-public:
-    template <typename T, size_t filters>
-    struct biquad_block
-    {
-        vec<T, filters> s1;
-        vec<T, filters> s2;
-        vec<T, filters> a1;
-        vec<T, filters> a2;
-        vec<T, filters> b0;
-        vec<T, filters> b1;
-        vec<T, filters> b2;
+    vec<T, filters> s1;
+    vec<T, filters> s2;
+    vec<T, filters> a1;
+    vec<T, filters> a2;
+    vec<T, filters> b0;
+    vec<T, filters> b1;
+    vec<T, filters> b2;
 
-        vec<T, filters> out;
-        biquad_block() : s1(0), s2(0), a1(0), a2(0), b0(1), b1(0), b2(0), out(0) {}
-        biquad_block(const biquad_params<T>* bq, size_t count) : s1(0), s2(0), out(0)
+    vec<T, filters> out;
+    biquad_block() : s1(0), s2(0), a1(0), a2(0), b0(1), b1(0), b2(0), out(0) {}
+    biquad_block(const biquad_params<T>* bq, size_t count) : s1(0), s2(0), out(0)
+    {
+        count = count > filters ? filters : count;
+        for (size_t i = 0; i < count; i++)
         {
-            count = count > filters ? filters : count;
-            for (size_t i = 0; i < count; i++)
-            {
-                a1(i) = bq[i].a1;
-                a2(i) = bq[i].a2;
-                b0(i) = bq[i].b0;
-                b1(i) = bq[i].b1;
-                b2(i) = bq[i].b2;
-            }
-            for (size_t i = count; i < filters; i++)
-            {
-                a1(i) = T(0);
-                a2(i) = T(0);
-                b0(i) = T(1);
-                b1(i) = T(0);
-                b2(i) = T(0);
-            }
+            a1(i) = bq[i].a1;
+            a2(i) = bq[i].a2;
+            b0(i) = bq[i].b0;
+            b1(i) = bq[i].b1;
+            b2(i) = bq[i].b2;
         }
-
-        template <size_t count>
-        biquad_block(const biquad_params<T> (&bq)[count]) : biquad_block(bq, count)
+        for (size_t i = count; i < filters; i++)
         {
-            static_assert(count <= filters, "count > filters");
+            a1(i) = T(0);
+            a2(i) = T(0);
+            b0(i) = T(1);
+            b1(i) = T(0);
+            b2(i) = T(0);
         }
-    };
+    }
 
-    template <size_t filters, typename T, typename E1>
-    struct expression_biquads : public expression<E1>
+    template <size_t count>
+    biquad_block(const biquad_params<T> (&bq)[count]) : biquad_block(bq, count)
     {
-        using value_type = T;
+        static_assert(count <= filters, "count > filters");
+    }
+};
 
-        expression_biquads(const biquad_block<T, filters>& bq, E1&& e1)
-            : expression<E1>(std::forward<E1>(e1)), bq(bq)
-        {
-        }
-        template <size_t width>
-        KFR_INTRIN vec<T, width> operator()(cinput_t, size_t index, vec_t<T, width> t) const
-        {
-            const vec<T, width> in = this->argument_first(index, t);
-            vec<T, width> out;
+template <size_t filters, typename T, typename E1, KFR_ARCH_DEP>
+struct expression_biquads : public expression<E1>
+{
+    using value_type = T;
 
-            KFR_LOOP_UNROLL
-            for (size_t i = 0; i < width; i++)
-            {
-                bq.out = process(bq, insertleft(in[i], bq.out));
-                out(i) = bq.out[filters - 1];
-            }
+    expression_biquads(const biquad_block<T, filters>& bq, E1&& e1)
+        : expression<E1>(std::forward<E1>(e1)), bq(bq)
+    {
+    }
+    template <size_t width>
+    KFR_INTRIN vec<T, width> operator()(cinput_t, size_t index, vec_t<T, width> t) const
+    {
+        const vec<T, width> in = this->argument_first(index, t);
+        vec<T, width> out;
 
-            return out;
-        }
-        KFR_SINTRIN vec<T, filters> process(biquad_block<T, filters>& bq, vec<T, filters> in)
+        KFR_LOOP_UNROLL
+        for (size_t i = 0; i < width; i++)
         {
-            const vec<T, filters> out = bq.b0 * in + bq.s1;
-            bq.s1 = bq.s2 + bq.b1 * in - bq.a1 * out;
-            bq.s2 = bq.b2 * in - bq.a2 * out;
-            return out;
+            bq.out = process(bq, insertleft(in[i], bq.out));
+            out(i) = bq.out[filters - 1];
         }
-        mutable biquad_block<T, filters> bq;
-    };
+
+        return out;
+    }
+    KFR_SINTRIN vec<T, filters> process(biquad_block<T, filters>& bq, vec<T, filters> in)
+    {
+        const vec<T, filters> out = bq.b0 * in + bq.s1;
+        bq.s1 = bq.s2 + bq.b1 * in - bq.a1 * out;
+        bq.s2 = bq.b2 * in - bq.a2 * out;
+        return out;
+    }
+    mutable biquad_block<T, filters> bq;
 };
 }
 
 template <typename T, typename E1>
-KFR_INLINE internal::in_biquad<>::expression_biquads<1, T, internal::arg<E1>> biquad(
-    const biquad_params<T>& bq, E1&& e1)
+KFR_INLINE internal::expression_biquads<1, T, internal::arg<E1>> biquad(const biquad_params<T>& bq, E1&& e1)
 {
     const biquad_params<T> bqs[1] = { bq };
-    return internal::in_biquad<>::expression_biquads<1, T, internal::arg<E1>>(bqs, std::forward<E1>(e1));
+    return internal::expression_biquads<1, T, internal::arg<E1>>(bqs, std::forward<E1>(e1));
 }
 template <size_t filters, typename T, typename E1>
-KFR_INLINE internal::in_biquad<>::expression_biquads<filters, T, internal::arg<E1>> biquad(
+KFR_INLINE internal::expression_biquads<filters, T, internal::arg<E1>> biquad(
     const biquad_params<T> (&bq)[filters], E1&& e1)
 {
-    return internal::in_biquad<>::expression_biquads<filters, T, internal::arg<E1>>(bq, std::forward<E1>(e1));
+    return internal::expression_biquads<filters, T, internal::arg<E1>>(bq, std::forward<E1>(e1));
 }
 }
diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp
@@ -38,7 +38,7 @@ using fir_taps = univector<T, Size>;
 
 namespace internal
 {
-template <size_t tapcount, typename T, typename E1>
+template <size_t tapcount, typename T, typename E1, KFR_ARCH_DEP>
 struct expression_short_fir : expression<E1>
 {
     static_assert(is_poweroftwo(tapcount), "tapcount must be a power of two");
@@ -47,6 +47,10 @@ struct expression_short_fir : expression<E1>
         : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0)
     {
     }
+    expression_short_fir(E1&& e1, const array_ref<const T>& taps)
+        : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0)
+    {
+    }
     template <typename U, size_t N>
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const
     {
@@ -63,7 +67,7 @@ struct expression_short_fir : expression<E1>
     mutable vec<T, tapcount - 1> delayline;
 };
 
-template <typename T, typename E1>
+template <typename T, typename E1, KFR_ARCH_DEP>
 struct expression_fir : expression<E1>
 {
     expression_fir(E1&& e1, const array_ref<const T>& taps)
diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp
@@ -32,7 +32,7 @@ namespace kfr
 namespace internal
 {
 
-template <typename T>
+template <typename T, KFR_ARCH_DEP>
 struct expression_goertzel : output_expression
 {
     expression_goertzel(complex<T>& result, T omega)
@@ -47,7 +47,7 @@ struct expression_goertzel : output_expression
     template <typename U, size_t N>
     KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
     {
-        vec<T, N> in = cast<T>(x);
+        vec<T, N> in = x;
         KFR_LOOP_UNROLL
         for (size_t i = 0; i < N; i++)
         {
@@ -84,7 +84,7 @@ struct expression_parallel_goertzel : output_expression
     template <typename U, size_t N>
     KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
     {
-        const vec<T, N> in = cast<T>(x);
+        const vec<T, N> in = x;
         KFR_LOOP_UNROLL
         for (size_t i = 0; i < N; i++)
         {
diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp
@@ -41,7 +41,7 @@ namespace intrinsics
 template <typename T, typename TF = ftype<T>>
 KFR_SINTRIN TF amp_to_dB(T amp)
 {
-    return log(cast<subtype<TF>>(amp)) * subtype<TF>(8.6858896380650365530225783783322);
+    return log(static_cast<TF>(amp)) * subtype<TF>(8.6858896380650365530225783783322);
     // return T( 20.0 ) * log10( level );
 }
 
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -147,7 +147,7 @@ struct expression_triangular : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(1 - abs(linspace(cinput, index, y)));
+        return 1 - abs(linspace(cinput, index, y));
     }
     size_t size() const { return m_size; }
 
@@ -169,7 +169,7 @@ struct expression_bartlett : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(1 - abs(linspace(cinput, index, y)));
+        return 1 - abs(linspace(cinput, index, y));
     }
     size_t size() const { return m_size; }
 
@@ -191,7 +191,7 @@ struct expression_cosine : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(sin(c_pi<T> * linspace(cinput, index, y)));
+        return sin(c_pi<T> * linspace(cinput, index, y));
     }
     size_t size() const { return m_size; }
 
@@ -213,7 +213,7 @@ struct expression_hann : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))));
+        return T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y)));
     }
     size_t size() const { return m_size; }
 
@@ -236,7 +236,7 @@ struct expression_bartlett_hann : input_expression
     {
         constexpr vec_t<T, N> y{};
         const vec<T, N> xx = linspace(cinput, index, y);
-        return cast<U>(T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))));
+        return T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5)));
     }
     size_t size() const { return m_size; }
 
@@ -258,7 +258,7 @@ struct expression_hamming : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))));
+        return alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y)));
     }
     size_t size() const { return m_size; }
 
@@ -282,7 +282,7 @@ struct expression_bohman : input_expression
     {
         constexpr vec_t<T, N> y{};
         const vec<U, N> n = abs(linspace(cinput, index, y));
-        return cast<U>((T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n));
+        return (T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n);
     }
     size_t size() const { return m_size; }
 
@@ -305,7 +305,7 @@ struct expression_blackman : input_expression
     {
         constexpr vec_t<T, N> y{};
         const vec<T, N> n = linspace(cinput, index, y);
-        return cast<U>(a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n));
+        return a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n);
     }
     size_t size() const { return m_size; }
 
@@ -330,7 +330,7 @@ struct expression_blackman_harris : input_expression
         constexpr vec_t<T, N> y{};
         const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>;
 
-        return cast<U>(T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n));
+        return T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n);
     }
     size_t size() const { return m_size; }
 
@@ -353,7 +353,7 @@ struct expression_kaiser : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m);
+        return modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m;
     }
     size_t size() const { return m_size; }
 
@@ -383,7 +383,7 @@ struct expression_flattop : input_expression
         constexpr T a2 = 1.29;
         constexpr T a3 = 0.388;
         constexpr T a4 = 0.028;
-        return cast<U>(a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n));
+        return a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n);
     }
     size_t size() const { return m_size; }
 
@@ -405,7 +405,7 @@ struct expression_gaussian : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(exp(-0.5 * sqr(alpha * linspace(cinput, index, y))));
+        return exp(-0.5 * sqr(alpha * linspace(cinput, index, y)));
     }
 
     size_t size() const { return m_size; }
@@ -428,7 +428,7 @@ struct expression_lanczos : input_expression
     KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         constexpr vec_t<T, N> y{};
-        return cast<U>(sinc(linspace(cinput, index, y)));
+        return sinc(linspace(cinput, index, y));
     }
     size_t size() const { return m_size; }
 
diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp
@@ -85,7 +85,7 @@ struct expression_file_writer : expression_file_base, output_expression
     {
         if (position != index)
             fseeko(file, static_cast<off_t>(index * sizeof(T)), SEEK_SET);
-        const vec<T, N> output = cast<T>(value);
+        const vec<T, N> output = value;
         fwrite(output.data(), sizeof(T), output.size(), file);
         position = index + N;
     }
@@ -104,7 +104,7 @@ struct expression_file_reader : expression_file_base, input_expression
         vec<T, N> input = qnan;
         fread(input.data(), sizeof(T), input.size(), file);
         position = index + N;
-        return cast<U>(input);
+        return input;
     }
     mutable size_t position = 0;
 };
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -44,17 +44,17 @@ TEST(complex_vector)
 
 TEST(complex_cast)
 {
-    const vec<f32, 4> v1 = subcast<f32>(make_vector(c32{ 0, 1 }, c32{ 2, 3 }));
+    const vec<f32, 4> v1 = bitcast<f32>(make_vector(c32{ 0, 1 }, c32{ 2, 3 }));
     CHECK(v1(0) == 0.f);
     CHECK(v1(1) == 1.f);
     CHECK(v1(2) == 2.f);
     CHECK(v1(3) == 3.f);
 
-    const vec<c32, 1> v2 = subcast<c32>(make_vector(1.f, 2.f));
+    const vec<c32, 1> v2 = bitcast<c32>(make_vector(1.f, 2.f));
     CHECK(v2(0) == 1.f);
     CHECK(v2(1) == 2.f);
 
-    const vec<c32, 2> v3 = cast<c32>(make_vector(1.f, 2.f));
+    const vec<c32, 2> v3 = make_vector(1.f, 2.f);
     CHECK(v3(0) == 1.f);
     CHECK(v3(1) == 0.f);
     CHECK(v3(2) == 2.f);

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

M	include/kfr/base/asin_acos.hpp	\|	4	++--
M	include/kfr/base/complex.hpp	\|	66	+++++++++++++++++++++++++++++++++++-------------------------------
M	include/kfr/base/expression.hpp	\|	15	+++++++--------
M	include/kfr/base/generators.hpp	\|	10	+++++-----
M	include/kfr/base/log_exp.hpp	\|	2	+-
M	include/kfr/base/operators.hpp	\|	115	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	include/kfr/base/pointer.hpp	\|	2	+-
M	include/kfr/base/random.hpp	\|	8	++++----
M	include/kfr/base/round.hpp	\|	20	++++++++++++++++++++
M	include/kfr/base/select.hpp	\|	7	+++----
M	include/kfr/base/types.hpp	\|	18	++++++++++++++----
M	include/kfr/base/univector.hpp	\|	6	+++---
M	include/kfr/base/vec.hpp	\|	256	+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
M	include/kfr/cometa.hpp	\|	20	++++++++++++--------
M	include/kfr/dft/ft.hpp	\|	3	+++
M	include/kfr/dsp/biquad.hpp	\|	135	+++++++++++++++++++++++++++++++++++++------------------------------------------
M	include/kfr/dsp/fir.hpp	\|	8	++++++--
M	include/kfr/dsp/goertzel.hpp	\|	6	+++---
M	include/kfr/dsp/units.hpp	\|	2	+-
M	include/kfr/dsp/window.hpp	\|	26	+++++++++++++-------------
M	include/kfr/io/file.hpp	\|	4	++--
M	tests/complex_test.cpp	\|	6	+++---