Constants refactoring, macros for pragma, MSVC support - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 85000e10217241e44402e582ffe366e1a8932389
parent c9da5fa2939fce901f20629aececfc967673e17c
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 11 Oct 2016 03:29:41 +0300

Constants refactoring, macros for pragma, MSVC support

Diffstat:
M CMakeLists.txt  | 3 +++
M examples/biquads.cpp  | 2 +-
M examples/dft.cpp  | 4 +---
M examples/fir.cpp  | 2 +-
M examples/sample_rate_conversion.cpp  | 2 +-
M examples/window.cpp  | 2 +-
M include/kfr/base/abs.hpp  | 8 ++++----
M include/kfr/base/atan.hpp  | 6 ++++--
M include/kfr/base/basic_expressions.hpp  | 22 +++++++++++-----------
M include/kfr/base/complex.hpp  | 40 +++++++++++++++++++++++++++++++++++-----
M include/kfr/base/constants.hpp  | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M include/kfr/base/cpuid.hpp  | 18 +++++++++++++++++-
M include/kfr/base/digitreverse.hpp  | 16 ++++++++++++----
M include/kfr/base/expression.hpp  | 11 ++++++-----
M include/kfr/base/function.hpp  | 59 ++++++++++++++++++++++++++++-------------------------------
M include/kfr/base/gamma.hpp  | 6 +++---
M include/kfr/base/generators.hpp  | 14 +++++++-------
M include/kfr/base/kfr.h  | 15 +++++++++------
M include/kfr/base/log_exp.hpp  | 26 +++++++++++++-------------
M include/kfr/base/logical.hpp  | 16 ++++++++--------
M include/kfr/base/memory.hpp  | 11 ++++++++---
M include/kfr/base/modzerobessel.hpp  | 89 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M include/kfr/base/operators.hpp  | 16 ++++++++--------
M include/kfr/base/platform.hpp  | 113 +++++++++++++++++++++++++++++++++++--------------------------------------------
M include/kfr/base/pointer.hpp  | 7 ++++---
M include/kfr/base/random.hpp  | 18 ++++++++++++------
M include/kfr/base/read_write.hpp  | 74 +++++++++++++++++++++++++++++++++++++-------------------------------------
M include/kfr/base/reduce.hpp  | 50 +++++++++++++++++++++++++-------------------------
M include/kfr/base/select.hpp  | 8 ++++----
M include/kfr/base/shuffle.hpp  | 30 +++++++++++++-----------------
M include/kfr/base/simd.hpp  | 87 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M include/kfr/base/sin_cos.hpp  | 92 +++++++++++++++++++++++++++++++++++--------------------------------------------
M include/kfr/base/sort.hpp  | 4 ++--
M include/kfr/base/tan.hpp  | 68 ++++++++++++++++++++++++++++++++++----------------------------------
M include/kfr/base/types.hpp  | 62 +++++++++++++-------------------------------------------------
M include/kfr/base/univector.hpp  | 14 ++++++++++----
M include/kfr/base/vec.hpp  | 210 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
M include/kfr/cident.h  | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
M include/kfr/cometa.hpp  | 262 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M include/kfr/cometa/array.hpp  | 5 +++++
M include/kfr/cometa/cstring.hpp  | 14 +++++++-------
M include/kfr/cometa/ctti.hpp  | 6 +++---
M include/kfr/cometa/function.hpp  | 2 +-
M include/kfr/cometa/named_arg.hpp  | 5 +++++
M include/kfr/cometa/string.hpp  | 71 +++++++++++++++++++++++++++++++++++++++++------------------------------
M include/kfr/data/sincos.hpp  | 220 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M include/kfr/dft/bitrev.hpp  | 24 ++++++++++++------------
M include/kfr/dft/cache.hpp  | 17 ++++++++++-------
M include/kfr/dft/conv.hpp  | 10 +++++-----
M include/kfr/dft/fft.hpp  | 310 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
M include/kfr/dft/ft.hpp  | 371 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M include/kfr/dsp/biquad.hpp  | 6 +++---
M include/kfr/dsp/delay.hpp  | 2 +-
M include/kfr/dsp/fir.hpp  | 2 +-
M include/kfr/dsp/mixdown.hpp  | 14 +++++++++++---
M include/kfr/dsp/oscillators.hpp  | 43 ++++++++++++++++++++++---------------------
M include/kfr/dsp/sample_rate_conversion.hpp  | 4 ++--
M include/kfr/dsp/units.hpp  | 20 ++++++++++----------
M include/kfr/dsp/window.hpp  | 8 ++++----
M include/kfr/io/audiofile.hpp  | 71 ++++++++++++++++++++++++++++++++++++++++-------------------------------
M include/kfr/io/python_plot.hpp  | 6 +++---
M include/kfr/io/tostring.hpp  | 4 ++++
M tests/CMakeLists.txt  | 21 +++++++++++++--------
M tests/base_test.cpp  | 22 +++++++++++++++-------
M tests/complex_test.cpp  | 86 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M tests/dft_test.cpp  | 2 +-
M tests/dsp_test.cpp  | 13 ++++++-------
M tests/empty_test.cpp  | 2 +-
M tests/expression_test.cpp  | 2 +-
M tests/intrinsic_test.cpp  | 28 ++++++++++++++++++++++++----
M tests/multiarch.cpp  | 2 +-
M tests/testo/testo.hpp  | 24 ++++++++++++------------
M tests/transcendental_test.cpp  | 17 +++++++++--------

73 files changed, 1887 insertions(+), 1362 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -56,6 +56,9 @@ set(CMAKE_C_FLAGS "${OPT_TARGET} ${OPT_BITNESS} ${OPT_STATIC}" CACHE STRING "com
 
 project(kfr)
 
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin/Debug)
+
 include(sources.cmake)
 
 set(ALL_WARNINGS -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-c99-extensions -Wno-padded)
diff --git a/examples/biquads.cpp b/examples/biquads.cpp
@@ -10,7 +10,7 @@
 
 using namespace kfr;
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/examples/dft.cpp b/examples/dft.cpp
@@ -11,7 +11,7 @@
 
 using namespace kfr;
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
@@ -45,7 +45,5 @@ int main(int argc, char** argv)
     println(in);
     println();
     println(dB);
-    (void)argc;
-    (void)argv;
     return 0;
 }
diff --git a/examples/fir.cpp b/examples/fir.cpp
@@ -10,7 +10,7 @@
 
 using namespace kfr;
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp
@@ -15,7 +15,7 @@ constexpr size_t output_sr = 44100;
 constexpr size_t len       = 96000 * 6;
 constexpr fbase i32max     = 2147483647.0;
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/examples/window.cpp b/examples/window.cpp
@@ -10,7 +10,7 @@
 
 using namespace kfr;
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -41,7 +41,7 @@ namespace intrinsics
 template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
 KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
 {
-    return x & internal::invhighbitmask<T>;
+    return x & constants<T>::invhighbitmask();
 }
 
 KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); }
@@ -86,7 +86,7 @@ KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); }
 #if defined CMT_ARCH_NEON64
 KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); }
 #else
-KFR_SINTRIN f64neon abs(const f64neon& x) { return x & internal::invhighbitmask<f64>; }
+KFR_SINTRIN f64neon abs(const f64neon& x) { return x & constants<f64>::invhighbitmask(); }
 #endif
 
 KFR_HANDLE_ALL_SIZES_1(abs)
@@ -97,14 +97,14 @@ KFR_HANDLE_ALL_SIZES_1(abs)
 template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
 KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
 {
-    return x & internal::invhighbitmask<T>;
+    return x & constants<T>::invhighbitmask();
 }
 
 // fallback
 template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
 KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
 {
-    return select(x >= T(), x, -x);
+    return select(x >= T(0), x, -x);
 }
 #endif
 KFR_I_CONVERTER(abs)
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -36,8 +36,9 @@ namespace kfr
 namespace intrinsics
 {
 template <size_t N>
-KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx)
 {
+    vec<f32, N> x = xx, y = yy;
     vec<f32, N> s, t, u;
     vec<i32, N> q;
     q = select(x < 0, -2, 0);
@@ -64,8 +65,9 @@ KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x)
 }
 
 template <size_t N>
-KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx)
 {
+    vec<f64, N> x = xx, y = yy;
     vec<f64, N> s, t, u;
     vec<i64, N> q;
     q = select(x < 0, i64(-2), i64(0));
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -276,7 +276,7 @@ struct expression_linspace<T, true> : input_expression
         return mix((enumerate(x) + cast<T>(cast<TI>(index))) * invsize, cast<T>(start), cast<T>(stop));
     }
     template <typename U, size_t N>
-    CMT_INLINE static vec<U, N> mix(vec<U, N> t, U x, U y)
+    CMT_INLINE static vec<U, N> mix(const vec<U, N>& t, U x, U y)
     {
         return (U(1.0) - t) * x + t * y;
     }
@@ -296,12 +296,12 @@ public:
     using T          = value_type;
 
     template <typename... Expr_>
-    CMT_INLINE expression_sequence(const size_t (&segments)[base::size], Expr_&&... expr) noexcept
+    CMT_INLINE expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) noexcept
         : base(std::forward<Expr_>(expr)...)
     {
         std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1);
-        this->segments[0]              = 0;
-        this->segments[base::size + 1] = size_t(-1);
+        this->segments[0]               = 0;
+        this->segments[base::count + 1] = size_t(-1);
     }
 
     template <size_t N>
@@ -314,7 +314,7 @@ public:
         else
         {
             vec<T, N> result;
-#pragma clang loop unroll_count(4)
+            CMT_PRAGMA_CLANG(clang loop unroll_count(4))
             for (size_t i = 0; i < N; i++)
             {
                 sindex           = segments[sindex + 1] == index ? sindex + 1 : sindex;
@@ -329,12 +329,12 @@ protected:
     template <size_t N>
     CMT_NOINLINE vec<T, N> get(cinput_t cinput, size_t index, size_t expr_index, vec_t<T, N> y)
     {
-        return cswitch(indicesfor<E...>, expr_index,
+        return cswitch(indicesfor_t<E...>(), expr_index,
                        [&](auto val) { return this->argument(cinput, val, index, y); },
                        [&]() { return zerovector(y); });
     }
 
-    std::array<size_t, base::size + 2> segments;
+    std::array<size_t, base::count + 2> segments;
 };
 
 template <typename Fn, typename E>
@@ -475,7 +475,7 @@ struct multioutput : output_expression
     template <typename T, size_t N>
     void operator()(coutput_t coutput, size_t index, const vec<T, N>& x)
     {
-        cfor(csize<0>, csize<sizeof...(E)>,
+        cfor(csize_t<0>(), csize_t<sizeof...(E)>(),
              [&](auto n) { std::get<val_of(decltype(n)())>(outputs)(coutput, index, x); });
     }
     std::tuple<E...> outputs;
@@ -517,7 +517,7 @@ struct expression_unpack : private expression<E...>, output_expression
     template <typename U, size_t N>
     CMT_INLINE void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x)
     {
-        output(coutput, index, x, csizeseq<count>);
+        output(coutput, index, x, csizeseq_t<count>());
     }
 
     template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
@@ -576,9 +576,9 @@ task_partition<OutExpr, InExpr> partition(OutExpr&& output, InExpr&& input, size
 {
     static_assert(!is_infinite<OutExpr>::value || !is_infinite<InExpr>::value, "");
 
-    minimum_size            = minimum_size == 0 ? vector_width<T> * 8 : minimum_size;
+    minimum_size            = minimum_size == 0 ? platform<T>::vector_width * 8 : minimum_size;
     const size_t size       = size_min(output.size(), input.size());
-    const size_t chunk_size = align_up(std::max(size / count, minimum_size), vector_width<T>);
+    const size_t chunk_size = align_up(std::max(size / count, minimum_size), platform<T>::vector_width);
 
     task_partition<OutExpr, InExpr> result(std::forward<OutExpr>(output), std::forward<InExpr>(input), size,
                                            chunk_size, (size + chunk_size - 1) / chunk_size);
diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp
@@ -40,6 +40,9 @@
 #include <complex>
 #endif
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4814))
+
 namespace kfr
 {
 #ifdef KFR_STD_COMPLEX
@@ -72,8 +75,13 @@ struct complex
     constexpr complex(complex<U>&& other) noexcept : re(std::move(other.re)), im(std::move(other.im))
     {
     }
+#ifdef CMT_COMPILER_GNU
     constexpr complex& operator=(const complex&) noexcept = default;
     constexpr complex& operator=(complex&&) noexcept = default;
+#else
+    complex& operator=(const complex&) = default;
+    complex& operator=(complex&&) = default;
+#endif
     constexpr const T& real() const noexcept { return re; }
     constexpr const T& imag() const noexcept { return im; }
     constexpr void real(T value) noexcept { re = value; }
@@ -158,7 +166,7 @@ struct compound_type_traits<kfr::complex<T>>
     template <typename U>
     using rebind = kfr::complex<U>;
     template <typename U>
-    using deep_rebind = kfr::complex<cometa::deep_rebind<subtype, U>>;
+    using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>;
 
     static constexpr subtype at(const kfr::complex<T>& value, size_t index)
     {
@@ -189,15 +197,15 @@ struct vec_op<complex<T>, N> : private vec_op<T, N * 2>
 
     constexpr static size_t w = N * 2;
 
-    CMT_INLINE constexpr static simd<scalar_type, w> mul(const simd<scalar_type, w>& x,
-                                                         const simd<scalar_type, w>& y) noexcept
+    CMT_INLINE static simd<scalar_type, w> mul(const simd<scalar_type, w>& x,
+                                               const simd<scalar_type, w>& y) noexcept
     {
         const vec<scalar_type, w> xx = x;
         const vec<scalar_type, w> yy = y;
         return *subadd(xx * dupeven(yy), swap<2>(xx) * dupodd(yy));
     }
-    CMT_INLINE constexpr static simd<scalar_type, w> div(const simd<scalar_type, w>& x,
-                                                         const simd<scalar_type, w>& y) noexcept
+    CMT_INLINE static simd<scalar_type, w> div(const simd<scalar_type, w>& x,
+                                               const simd<scalar_type, w>& y) noexcept
     {
         const vec<scalar_type, w> xx = x;
         const vec<scalar_type, w> yy = y;
@@ -670,4 +678,26 @@ struct common_type<T1, kfr::complex<T2>>
 {
     using type = kfr::complex<typename common_type<T1, T2>::type>;
 };
+template <typename T1, typename T2, size_t N>
+struct common_type<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>>
+{
+    using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
+};
+template <typename T1, typename T2, size_t N>
+struct common_type<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>>
+{
+    using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
+};
+template <typename T1, typename T2, size_t N>
+struct common_type<kfr::complex<T1>, kfr::vec<T2, N>>
+{
+    using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
+};
+template <typename T1, typename T2, size_t N>
+struct common_type<kfr::vec<T1, N>, kfr::complex<T2>>
+{
+    using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
+};
 }
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/constants.hpp b/include/kfr/base/constants.hpp
@@ -31,66 +31,256 @@
 namespace kfr
 {
 
+#if CMT_COMPILER_GNU
+constexpr double infinity = __builtin_inf();
+constexpr double qnan     = __builtin_nan("");
+#else
+constexpr double infinity = HUGE_VAL;
+constexpr double qnan     = NAN;
+#endif
+
+template <typename T>
+struct constants
+{
+public:
+    using Tsub = subtype<T>;
+
+    constexpr static Tsub pi_s(int m, int d = 1) { return pi * m / d; }
+    constexpr static Tsub recip_pi_s(int m, int d = 1) { return recip_pi * m / d; }
+
+    constexpr static Tsub pi           = static_cast<Tsub>(3.1415926535897932384626433832795);
+    constexpr static Tsub sqr_pi       = static_cast<Tsub>(9.8696044010893586188344909998762);
+    constexpr static Tsub recip_pi     = static_cast<Tsub>(0.31830988618379067153776752674503);
+    constexpr static Tsub degtorad     = static_cast<Tsub>(pi / 180);
+    constexpr static Tsub radtodeg     = static_cast<Tsub>(180 / pi); // radians -> degrees: 180/pi, inverse of degtorad
+    constexpr static Tsub e            = static_cast<Tsub>(2.718281828459045235360287471352662);
+    constexpr static Tsub recip_log_2  = static_cast<Tsub>(1.442695040888963407359924681001892137426645954);
+    constexpr static Tsub recip_log_10 = static_cast<Tsub>(0.43429448190325182765112891891661);
+    constexpr static Tsub log_2        = static_cast<Tsub>(0.69314718055994530941723212145818);
+    constexpr static Tsub log_10       = static_cast<Tsub>(2.3025850929940456840179914546844);
+    constexpr static Tsub sqrt_2       = static_cast<Tsub>(1.4142135623730950488016887242097);
+
+    constexpr static Tsub fold_constant_div = choose_const<Tsub>(
+        CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01));
+
+    constexpr static Tsub fold_constant_hi = choose_const<Tsub>(
+        CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01));
+    constexpr static Tsub fold_constant_rem1 =
+        choose_const<Tsub>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f),
+                           CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08));
+    constexpr static Tsub fold_constant_rem2 =
+        choose_const<Tsub>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f),
+                           CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15));
+
+    constexpr static Tsub epsilon     = std::numeric_limits<Tsub>::epsilon();
+    constexpr static Tsub infinity    = std::numeric_limits<Tsub>::infinity();
+    constexpr static Tsub neginfinity = -std::numeric_limits<Tsub>::infinity();
+    constexpr static Tsub qnan        = std::numeric_limits<Tsub>::quiet_NaN();
+
+#if CMT_COMPILER_GNU
+
+    constexpr static Tsub allones()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return -__builtin_nanf("0xFFFFFFFF");
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return -__builtin_nan("0xFFFFFFFFFFFFFFFF");
+        }
+        else
+        {
+            return static_cast<Tsub>(-1ll);
+        }
+    }
+
+    constexpr static Tsub allzeros() { return Tsub(0); }
+
+    constexpr static Tsub highbitmask()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return -0.0f;
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return -0.0;
+        }
+        else
+        {
+            return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1));
+        }
+    }
+
+    constexpr static Tsub invhighbitmask()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return __builtin_nanf("0xFFFFFFFF");
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return __builtin_nan("0xFFFFFFFFFFFFFFFF");
+        }
+        else
+        {
+            return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1);
+        }
+    }
+#else
+
+    static Tsub allones()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return static_cast<Tsub>(bitcast<f32>(0xFFFFFFFFu));
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return static_cast<Tsub>(bitcast<f64>(0xFFFFFFFFFFFFFFFFull));
+        }
+        else
+        {
+            return static_cast<Tsub>(-1ll);
+        }
+    }
+
+    constexpr static Tsub allzeros() { return Tsub(0); }
+
+    static Tsub highbitmask()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return static_cast<Tsub>(-0.0f);
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return static_cast<Tsub>(-0.0);
+        }
+        else
+        {
+            return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1));
+        }
+    }
+
+    static Tsub invhighbitmask()
+    {
+        if (is_same<Tsub, f32>::value)
+        {
+            return static_cast<Tsub>(bitcast<f32>(0x7FFFFFFFu));
+        }
+        else if (is_same<Tsub, f64>::value)
+        {
+            return static_cast<Tsub>(bitcast<f64>(0x7FFFFFFFFFFFFFFFull));
+        }
+        else
+        {
+            return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1);
+        }
+    }
+#endif
+};
+
+template <typename T>
+constexpr subtype<T> constants<T>::pi;
+template <typename T>
+constexpr subtype<T> constants<T>::sqr_pi;
+template <typename T>
+constexpr subtype<T> constants<T>::recip_pi;
+template <typename T>
+constexpr subtype<T> constants<T>::degtorad;
+template <typename T>
+constexpr subtype<T> constants<T>::radtodeg;
+template <typename T>
+constexpr subtype<T> constants<T>::e;
+template <typename T>
+constexpr subtype<T> constants<T>::recip_log_2;
+template <typename T>
+constexpr subtype<T> constants<T>::recip_log_10;
+template <typename T>
+constexpr subtype<T> constants<T>::log_2;
+template <typename T>
+constexpr subtype<T> constants<T>::log_10;
+template <typename T>
+constexpr subtype<T> constants<T>::sqrt_2;
+template <typename T>
+constexpr subtype<T> constants<T>::fold_constant_div;
+template <typename T>
+constexpr subtype<T> constants<T>::fold_constant_hi;
+template <typename T>
+constexpr subtype<T> constants<T>::fold_constant_rem1;
+template <typename T>
+constexpr subtype<T> constants<T>::fold_constant_rem2;
+template <typename T>
+constexpr subtype<T> constants<T>::epsilon;
+template <typename T>
+constexpr subtype<T> constants<T>::infinity;
+template <typename T>
+constexpr subtype<T> constants<T>::neginfinity;
+template <typename T>
+constexpr subtype<T> constants<T>::qnan;
+
 // π (pi)
 // c_pi<f64, 4>      = 4pi
 // c_pi<f64, 3, 4>   = 3/4pi
-template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>>
-constexpr Tsub c_pi = Tsub(3.1415926535897932384626433832795 * m / d);
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d);
 
 // π² (pi²)
 // c_sqr_pi<f64, 4>      = 4pi²
 // c_sqr_pi<f64, 3, 4>   = 3/4pi²
-template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>>
-constexpr Tsub c_sqr_pi = Tsub(9.8696044010893586188344909998762 * m / d);
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d);
 
 // 1/π (1/pi)
 // c_recip_pi<f64>       1/pi
 // c_recip_pi<f64, 4>    4/pi
-template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>>
-constexpr Tsub c_recip_pi = Tsub(0.31830988618379067153776752674503 * m / d);
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d);
 
 // degree to radian conversion factor
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_degtorad = c_pi<T, 1, 180>;
+template <typename T>
+constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>;
 
 // radian to degree conversion factor
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_radtodeg = c_recip_pi<T, 180>;
+template <typename T>
+constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>;
 
 // e, Euler's number
-template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>>
-constexpr Tsub c_e = Tsub(2.718281828459045235360287471352662 * m / d);
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d);
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr unsigned c_mantissa_bits = sizeof(Tsub) == 32 ? 23 : 52;
+template <typename T>
+constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 4 ? 23 : 52; // sizeof is in bytes: 4-byte subtype -> 23 bits, otherwise 52
 
-template <typename T, typename Tsub = usubtype<T>>
-constexpr Tsub c_mantissa_mask = (Tsub(1) << c_mantissa_bits<T>)-1;
+template <typename T>
+constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1;
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_epsilon = (std::numeric_limits<Tsub>::epsilon());
+template <typename T>
+constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon());
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_infinity = std::numeric_limits<Tsub>::infinity();
+template <typename T>
+constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity();
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_neginfinity = -std::numeric_limits<Tsub>::infinity();
+template <typename T>
+constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity();
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_qnan = std::numeric_limits<Tsub>::quiet_NaN();
+template <typename T>
+constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN();
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_recip_log_2 = Tsub(1.442695040888963407359924681001892137426645954);
+template <typename T>
+constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954);
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_recip_log_10 = Tsub(0.43429448190325182765112891891661);
+template <typename T>
+constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661);
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_log_2 = Tsub(0.69314718055994530941723212145818);
+template <typename T>
+constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818);
 
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub c_log_10 = Tsub(2.3025850929940456840179914546844);
+template <typename T>
+constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844);
 
-template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>>
-constexpr Tsub c_sqrt_2 = Tsub(1.4142135623730950488016887242097 * m / d);
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d);
 }
diff --git a/include/kfr/base/cpuid.hpp b/include/kfr/base/cpuid.hpp
@@ -25,6 +25,10 @@
  */
 #pragma once
 
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
 #include "types.hpp"
 #include <cstring>
 
@@ -121,9 +125,21 @@ CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
 CMT_INLINE u32 get_xcr0()
 {
     u32 xcr0;
-    __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
+    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
     return xcr0;
 }
+#elif defined CMT_COMPILER_MSVC
+
+CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { __cpuidex((int*)ptr, (int)func, (int)subfunc); }
+CMT_INLINE u32 get_xcr0()
+{
+#ifdef _XCR_XFEATURE_ENABLED_MASK
+    unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (u32)Result;
+#else
+    return 0;
+#endif
+}
 #endif
 
 template <size_t = 0>
diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp
@@ -48,13 +48,14 @@ constexpr inline u32 bit_permute_step_simple(u32 x, u32 m, u32 shift)
     return ((x & m) << shift) | ((x >> shift) & m);
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshift-count-overflow"
-#pragma GCC diagnostic ignored "-Wshift-count-negative"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative")
 
 template <size_t radix, size_t bits>
 constexpr enable_if<radix == 4, u32> digitreverse(u32 x)
 {
+#ifdef CMT_COMPILER_GNU
     if (bits <= 2)
         return x;
     if (bits <= 4)
@@ -84,9 +85,16 @@ constexpr enable_if<radix == 4, u32> digitreverse(u32 x)
         return x >> (32 - bits);
     }
     return x;
+#else
+    x = bit_permute_step_simple(x, 0x33333333, 2); // Bit index complement 1      regroups 4 bits
+    x = bit_permute_step_simple(x, 0x0f0f0f0f, 4); // Bit index complement 2      regroups 8 bits
+    x = bit_permute_step_simple(x, 0x00ff00ff, 8); // Bit index complement 3      regroups 16 bits
+    x = bit_permute_step_simple(x, 0x0000ffff, 16); // Bit index complement 4     regroups 32 bits
+    return x >> (32 - bits);
+#endif
 }
 
-#pragma GCC diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 template <size_t radix, size_t bits>
 struct shuffle_index_digitreverse
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -31,8 +31,8 @@
 
 #include <tuple>
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 
 namespace kfr
 {
@@ -399,9 +399,9 @@ CMT_INLINE size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 
     in.begin_block(cinput, size);
 
 #ifdef NDEBUG
-    constexpr size_t w = width == 0 ? internal::get_vector_width<Tin, c>(2, 4) : width;
+    constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width * bitness_const(2, 4) : width;
 #else
-    constexpr size_t w = width == 0 ? internal::get_vector_width<Tin, c>(1, 1) : width;
+    constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width * bitness_const(1, 1) : width;
 #endif
 
     size_t i = start;
@@ -442,4 +442,5 @@ struct output_expression_base : output_expression
     }
 };
 }
-#pragma clang diagnostic pop
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -30,8 +30,8 @@
 #include "types.hpp"
 #include "vec.hpp"
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 
 namespace kfr
 {
@@ -127,10 +127,11 @@ template <cpu_t c, typename T>
 constexpr inline size_t next_simd_width(size_t n)
 {
 #ifdef CMT_ARCH_X86
-    return n > vector_width<T, cpu_t::sse2> ? vector_width<T, c> : vector_width<T, cpu_t::sse2>;
+    return n > platform<T, cpu_t::sse2>::vector_width ? platform<T, c>::vector_width
+                                                      : platform<T, cpu_t::sse2>::vector_width;
 #endif
 #ifdef CMT_ARCH_ARM
-    return vector_width<T, cpu_t::neon>;
+    return platform<T, cpu_t::neon>::vector_width;
 #endif
 }
 
@@ -147,116 +148,112 @@ KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value)
 }
 
 #define KFR_HANDLE_ALL_SIZES_1(fn)                                                                           \
-    template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>                       \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>                            \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a)));                                                              \
     }                                                                                                        \
-    template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>          \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return concat(fn(low(a)), fn(high(a)));                                                              \
     }
 
 #define KFR_HANDLE_ALL_SIZES_FLT_1(fn)                                                                       \
-    template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>                       \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>                            \
     KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a)                                                   \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a))));                                           \
     }                                                                                                        \
-    template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>          \
     KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a)                                                   \
     {                                                                                                        \
         return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a))));                        \
     }
 
 #define KFR_HANDLE_ALL_SIZES_F_1(fn)                                                                         \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_f_class<T>::value)>                     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_f_class<T>::value)>    \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a)));                                                              \
     }                                                                                                        \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_f_class<T>::value), typename = void>   \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_f_class<T>::value),   \
+              typename = void>                                                                               \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return concat(fn(low(a)), fn(high(a)));                                                              \
     }
 
 #define KFR_HANDLE_ALL_SIZES_I_1(fn)                                                                         \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_i_class<T>::value)>                     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_i_class<T>::value)>    \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a)));                                                              \
     }                                                                                                        \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_i_class<T>::value), typename = void>   \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_i_class<T>::value),   \
+              typename = void>                                                                               \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return concat(fn(low(a)), fn(high(a)));                                                              \
     }
 
 #define KFR_HANDLE_ALL_SIZES_U_1(fn)                                                                         \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_u_class<T>::value)>                     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_u_class<T>::value)>    \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a)));                                                              \
     }                                                                                                        \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_u_class<T>::value), typename = void>   \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_u_class<T>::value),   \
+              typename = void>                                                                               \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return concat(fn(low(a)), fn(high(a)));                                                              \
     }
 
 #define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn)                                                                     \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && !is_f_class<T>::value)>                    \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && !is_f_class<T>::value)>   \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a)));                                                              \
     }                                                                                                        \
-    template <typename T, size_t N,                                                                          \
-              KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && !is_f_class<T>::value), typename = void>  \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && !is_f_class<T>::value),  \
+              typename = void>                                                                               \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a)                                                             \
     {                                                                                                        \
         return concat(fn(low(a)), fn(high(a)));                                                              \
     }
 
 #define KFR_HANDLE_ALL_SIZES_2(fn)                                                                           \
-    template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>                       \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>                            \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b)                                         \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a), expand_simd(b)));                                              \
     }                                                                                                        \
-    template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>          \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b)                                         \
     {                                                                                                        \
         return concat(fn(low(a), low(b)), fn(high(a), high(b)));                                             \
     }
 
 #define KFR_HANDLE_ALL_SIZES_3(fn)                                                                           \
-    template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>                       \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>                            \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)                     \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c)));                              \
     }                                                                                                        \
-    template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>          \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)                     \
     {                                                                                                        \
         return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c)));                            \
     }
 
 #define KFR_HANDLE_ALL_SIZES_4(fn)                                                                           \
-    template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>                       \
+    template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>                            \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
     {                                                                                                        \
         return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d)));              \
     }                                                                                                        \
-    template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>     \
+    template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>          \
     KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
     {                                                                                                        \
         return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d)));           \
@@ -277,4 +274,4 @@ inline T to_scalar(const vec<T, 1>& value)
 }
 }
 }
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp
@@ -27,9 +27,9 @@
 #include "function.hpp"
 #include "log_exp.hpp"
 
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wc99-extensions")
-#pragma clang diagnostic ignored "-Wc99-extensions"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
 #endif
 
 namespace kfr
@@ -92,4 +92,4 @@ KFR_INTRIN internal::expression_function<fn::factorial_approx, E1> factorial_app
 }
 }
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -58,7 +58,7 @@ protected:
     template <size_t N>
     void call_shift(csize_t<N>) const
     {
-        ptr_cast<Class>(this)->shift(csize<N>);
+        ptr_cast<Class>(this)->shift(csize_t<N>());
     }
 
     template <size_t N>
@@ -81,7 +81,7 @@ protected:
     CMT_INLINE vec<T, N> generate(vec_t<T, N>) const
     {
         const vec<T, N> result = narrow<N>(value);
-        shift(csize<N>);
+        shift(csize_t<N>());
         return result;
     }
 
@@ -96,7 +96,7 @@ protected:
     mutable vec<T, width> value;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)>
+template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2)>
 struct generator_linear : generator<T, width, generator_linear<T, width>>
 {
     constexpr generator_linear(T start, T step) noexcept : step(step), vstep(step* width)
@@ -113,7 +113,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
 struct generator_exp : generator<T, width, generator_exp<T, width>>
 {
     generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step* width))[0] - 1)
@@ -130,7 +130,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
 struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
 {
     generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
@@ -147,7 +147,7 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
 struct generator_cossin : generator<T, width, generator_cossin<T, width>>
 {
     generator_cossin(T start, T step)
@@ -172,7 +172,7 @@ protected:
     }
 };
 
-template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4), KFR_ARCH_DEP>
+template <typename T, size_t width = platform<T>::vector_width* bitness_const(2, 4), KFR_ARCH_DEP>
 struct generator_sin : generator<T, width, generator_sin<T, width>>
 {
     generator_sin(T start, T step)
diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h
@@ -14,6 +14,14 @@
 #define KFR_VERSION_BUILD 0
 #define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_BUILD)
 
+#ifdef CMT_ARCH_X64
+#define KFR_VERSION_FULL                                                                                     \
+    "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit (" CMT_COMPIER_NAME ")"
+#else
+#define KFR_VERSION_FULL                                                                                     \
+    "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit (" CMT_COMPIER_NAME ")"
+#endif
+
 #ifdef __cplusplus
 namespace kfr
 {
@@ -22,14 +30,9 @@ constexpr int version_major           = KFR_VERSION_MAJOR;
 constexpr int version_minor           = KFR_VERSION_MINOR;
 constexpr int version_build           = KFR_VERSION_BUILD;
 constexpr int version                 = KFR_VERSION;
+constexpr const char version_full[]   = KFR_VERSION_FULL;
 }
 #endif
 
-#ifdef CMT_ARCH_X64
-#define KFR_VERSION_FULL "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit"
-#else
-#define KFR_VERSION_FULL "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit"
-#endif
-
 #define KFR_INTRIN CMT_INTRIN
 #define KFR_SINTRIN CMT_INTRIN static
diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp
@@ -96,7 +96,7 @@ KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d)
     vec<f32, N> x  = (m - 1.0f) / (m + 1.0f);
     vec<f32, N> x2 = x * x;
 
-    vec<f32, N> sp = select(d < 0, c_qnan<f32>, c_neginfinity<f32>);
+    vec<f32, N> sp = select(d < 0, constants<f32>::qnan, constants<f32>::neginfinity);
 
     vec<f32, N> t = 0.2371599674224853515625f;
     t = fmadd(t, x2, 0.285279005765914916992188f);
@@ -119,7 +119,7 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d)
     vec<f64, N> x  = (m - 1.0) / (m + 1.0);
     vec<f64, N> x2 = x * x;
 
-    vec<f64, N> sp = select(d < 0, c_qnan<f64>, c_neginfinity<f64>);
+    vec<f64, N> sp = select(d < 0, constants<f64>::qnan, constants<f64>::neginfinity);
 
     vec<f64, N> t = 0.148197055177935105296783;
     t = fmadd(t, x2, 0.153108178020442575739679);
@@ -130,7 +130,7 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d)
     t = fmadd(t, x2, 0.666666666666685503450651);
     t = fmadd(t, x2, 2);
 
-    x = x * t + c_log_2<f64> * cast<f64>(e);
+    x = x * t + constants<f64>::log_2 * cast<f64>(e);
     x = select(d > 0, x, sp);
 
     return x;
@@ -139,12 +139,12 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d)
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x)
 {
-    return log(cast<Tout>(x)) * c_recip_log_2<Tout>;
+    return log(cast<Tout>(x)) * constants<Tout>::recip_log_2;
 }
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x)
 {
-    return log(cast<Tout>(x)) * c_recip_log_10<Tout>;
+    return log(cast<Tout>(x)) * constants<Tout>::recip_log_10;
 }
 
 template <size_t N>
@@ -153,7 +153,7 @@ KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d)
     const f32 ln2_part1 = 0.6931457519f;
     const f32 ln2_part2 = 1.4286067653e-6f;
 
-    vec<i32, N> q = cast<i32>(floor(d * c_recip_log_2<f32>));
+    vec<i32, N> q = cast<i32>(floor(d * constants<f32>::recip_log_2));
     vec<f32, N> s, u;
 
     s = fmadd(cast<f32>(q), -ln2_part1, d);
@@ -176,7 +176,7 @@ KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d)
     u = s * s * u + s + 1.0f;
     u = vldexpk(u, q);
 
-    u = select(d == c_neginfinity<f32>, 0.f, u);
+    u = select(d == constants<f32>::neginfinity, 0.f, u);
 
     return u;
 }
@@ -187,7 +187,7 @@ KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d)
     const f64 ln2_part1 = 0.69314717501401901245;
     const f64 ln2_part2 = 5.545926273775592108e-009;
 
-    vec<i64, N> q = cast<i64>(floor(d * c_recip_log_2<f64>));
+    vec<i64, N> q = cast<i64>(floor(d * +constants<f64>::recip_log_2));
     vec<f64, N> s, u;
 
     s = fmadd(cast<f64>(q), -ln2_part1, d);
@@ -218,19 +218,19 @@ KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d)
     u = s * s * u + s + 1.0;
     u = vldexpk(u, q);
 
-    u = select(d == c_neginfinity<f64>, 0.0, u);
+    u = select(d == constants<f64>::neginfinity, 0.0, u);
 
     return u;
 }
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> exp2(const vec<T, N>& x)
 {
-    return exp(x * c_log_2<Tout>);
+    return exp(x * constants<Tout>::log_2);
 }
 template <typename T, size_t N, typename Tout = flt_type<T>>
 KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x)
 {
-    return exp(x * c_log_10<Tout>);
+    return exp(x * constants<Tout>::log_10);
 }
 
 template <typename T, size_t N>
@@ -239,8 +239,8 @@ KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
     const vec<T, N> t       = exp(b * log(abs(a)));
     const mask<T, N> isint  = floor(b) == b;
     const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0;
-    return select(a > T(), t,
-                  select(a == T(), T(1), select(isint, select(iseven, t, -t), broadcast<N>(c_qnan<T>))));
+    return select(a > T(), t, select(a == T(), T(1),
+                                     select(isint, select(iseven, t, -t), broadcast<N>(constants<T>::qnan))));
 }
 
 template <typename T, size_t N>
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -139,23 +139,23 @@ KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); }
 KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); }
 #endif
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN bool bittestall(const vec<T, N>& a)
 {
     return bittestall(expand_simd(a, internal::maskbits<T>(true)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN bool bittestall(const vec<T, N>& a)
 {
     return bittestall(low(a)) && bittestall(high(a));
 }
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN bool bittestany(const vec<T, N>& a)
 {
     return bittestany(expand_simd(a, internal::maskbits<T>(false)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN bool bittestany(const vec<T, N>& a)
 {
     return bittestany(low(a)) || bittestany(high(a));
@@ -192,23 +192,23 @@ KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a
 KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
 KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN bool bittestall(const vec<T, N>& a)
 {
     return bittestall(expand_simd(a, internal::maskbits<T>(true)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN bool bittestall(const vec<T, N>& a)
 {
     return bittestall(low(a)) && bittestall(high(a));
 }
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN bool bittestany(const vec<T, N>& a)
 {
     return bittestany(expand_simd(a, internal::maskbits<T>(false)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN bool bittestany(const vec<T, N>& a)
 {
     return bittestany(low(a)) || bittestany(high(a));
diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp
@@ -57,7 +57,11 @@ struct mem_header
     u8 reserved1;
     u8 reserved2;
     size_t size;
-} __attribute__((__packed__));
+}
+#ifdef CMT_GNU_ATTRIBUTES
+__attribute__((__packed__))
+#endif
+;
 
 inline mem_header* aligned_header(void* ptr) { return ptr_cast<mem_header>(ptr) - 1; }
 
@@ -85,11 +89,12 @@ inline void aligned_free(void* ptr)
 }
 }
 
-template <typename T = void, size_t alignment = native_cache_alignment>
+template <typename T = void, size_t alignment = platform<>::native_cache_alignment>
 CMT_INLINE T* aligned_allocate(size_t size = 1)
 {
     T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED(
-        internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>), alignment), alignment));
+        internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment),
+        alignment));
     return ptr;
 }
 
diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp
@@ -27,9 +27,9 @@
 #include "function.hpp"
 #include "log_exp.hpp"
 
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wc99-extensions")
-#pragma clang diagnostic ignored "-Wc99-extensions"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
 #endif
 
 namespace kfr
@@ -38,50 +38,49 @@ namespace kfr
 namespace intrinsics
 {
 
-template <typename T>
-constexpr T bessel_coef[] = { T(0.25),
-                              T(0.027777777777777776236),
-                              T(0.0017361111111111110147),
-                              T(6.9444444444444444384e-005),
-                              T(1.9290123456790123911e-006),
-                              T(3.9367598891408417495e-008),
-                              T(6.1511873267825652335e-010),
-                              T(7.5940584281266239246e-012),
-                              T(7.5940584281266233693e-014),
-                              T(6.2760813455591932909e-016),
-                              T(4.3583898233049949985e-018),
-                              T(2.5789288895295827557e-020),
-                              T(1.3157800456783586208e-022),
-                              T(5.8479113141260384983e-025),
-                              T(2.2843403570804837884e-027),
-                              T(7.904291893012054025e-030),
-                              T(2.4395962632753252792e-032),
-                              T(6.75788438580422547e-035),
-                              T(1.689471096451056426e-037),
-                              T(3.8310002187098784929e-040),
-                              T(7.9152897080782616517e-043),
-                              T(1.4962740468957016443e-045),
-                              T(2.5976979980828152196e-048),
-                              T(4.1563167969325041577e-051),
-                              T(6.1483976285983795968e-054),
-                              T(8.434015951438105991e-057),
-                              T(1.0757673407446563809e-059),
-                              T(1.2791526049282476926e-062),
-                              T(1.4212806721424974034e-065),
-                              T(1.4789601166935457918e-068),
-                              T(1.4442969889585408123e-071),
-                              T(1.3262598613026086927e-074),
-                              T(1.1472836170437790782e-077),
-                              T(9.3655805472961564331e-081),
-                              T(7.2265282000741942594e-084),
-                              T(5.2786911614858977913e-087),
-                              T(3.6556032974279072401e-090),
-                              T(2.4034209713529963119e-093),
-                              T(1.5021381070956226783e-096) };
-
 template <typename T, size_t N>
 CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x)
 {
+    constexpr static T bessel_coef[] = { T(0.25),
+                                         T(0.027777777777777776236),
+                                         T(0.0017361111111111110147),
+                                         T(6.9444444444444444384e-005),
+                                         T(1.9290123456790123911e-006),
+                                         T(3.9367598891408417495e-008),
+                                         T(6.1511873267825652335e-010),
+                                         T(7.5940584281266239246e-012),
+                                         T(7.5940584281266233693e-014),
+                                         T(6.2760813455591932909e-016),
+                                         T(4.3583898233049949985e-018),
+                                         T(2.5789288895295827557e-020),
+                                         T(1.3157800456783586208e-022),
+                                         T(5.8479113141260384983e-025),
+                                         T(2.2843403570804837884e-027),
+                                         T(7.904291893012054025e-030),
+                                         T(2.4395962632753252792e-032),
+                                         T(6.75788438580422547e-035),
+                                         T(1.689471096451056426e-037),
+                                         T(3.8310002187098784929e-040),
+                                         T(7.9152897080782616517e-043),
+                                         T(1.4962740468957016443e-045),
+                                         T(2.5976979980828152196e-048),
+                                         T(4.1563167969325041577e-051),
+                                         T(6.1483976285983795968e-054),
+                                         T(8.434015951438105991e-057),
+                                         T(1.0757673407446563809e-059),
+                                         T(1.2791526049282476926e-062),
+                                         T(1.4212806721424974034e-065),
+                                         T(1.4789601166935457918e-068),
+                                         T(1.4442969889585408123e-071),
+                                         T(1.3262598613026086927e-074),
+                                         T(1.1472836170437790782e-077),
+                                         T(9.3655805472961564331e-081),
+                                         T(7.2265282000741942594e-084),
+                                         T(5.2786911614858977913e-087),
+                                         T(3.6556032974279072401e-090),
+                                         T(2.4034209713529963119e-093),
+                                         T(1.5021381070956226783e-096) };
+
     const vec<T, N> x_2     = x * 0.5;
     const vec<T, N> x_2_sqr = x_2 * x_2;
     vec<T, N> num           = x_2_sqr;
@@ -91,7 +90,7 @@ CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x)
     CMT_LOOP_UNROLL
     for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++)
     {
-        result = fmadd((num *= x_2_sqr), bessel_coef<T>[i], result);
+        result = fmadd((num *= x_2_sqr), bessel_coef[i], result);
     }
     return result;
 }
@@ -113,4 +112,4 @@ KFR_INTRIN internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1
 }
 }
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp
@@ -83,7 +83,7 @@ KFR_FN(add)
 /**
  * @brief Returns template expression that returns sum of all the arguments passed to a function.
  */
-template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
+template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)>
 CMT_INLINE internal::expression_function<fn::add, E...> add(E&&... x)
 {
     return { fn::add(), std::forward<E>(x)... };
@@ -310,7 +310,7 @@ inline common_type<T1, T2> bitwiseand(const T1& x, const T2& y)
 template <typename T>
 constexpr inline T bitwiseand(initialvalue<T>)
 {
-    return internal::allones<subtype<T>>;
+    return constants<T>::allones();
 }
 KFR_FN(bitwiseand)
 
@@ -323,7 +323,7 @@ inline common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y)
 template <typename T>
 constexpr inline T bitwiseandnot(initialvalue<T>)
 {
-    return internal::allones<subtype<T>>;
+    return constants<T>::allones();
 }
 KFR_FN(bitwiseandnot)
 
@@ -336,7 +336,7 @@ inline common_type<T1, T2> bitwiseor(const T1& x, const T2& y)
 template <typename T>
 constexpr inline T bitwiseor(initialvalue<T>)
 {
-    return subtype<T>();
+    return subtype<T>(0);
 }
 KFR_FN(bitwiseor)
 
@@ -512,14 +512,14 @@ KFR_FN(reciprocal)
 template <typename T1, typename T2>
 CMT_INLINE common_type<T1, T2> mulsign(const T1& x, const T2& y)
 {
-    return x ^ (y & internal::highbitmask<subtype<T2>>);
+    return x ^ (y & constants<T2>::highbitmask());
 }
 KFR_FN(mulsign)
 
 template <typename T, size_t N>
 constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y)
 {
-    return (x & internal::highbitmask<T>) | (y & internal::highbitmask<T>);
+    return (x ^ (x & constants<T>::highbitmask())) | (y & constants<T>::highbitmask()); // |x|, sign of y
 }
 
 template <typename T, size_t N>
@@ -531,7 +531,7 @@ CMT_INLINE mask<T, N> isnan(const vec<T, N>& x)
 template <typename T, size_t N>
 CMT_INLINE mask<T, N> isinf(const vec<T, N>& x)
 {
-    return x == c_infinity<T> || x == -c_infinity<T>;
+    return x == constants<T>::infinity || x == -constants<T>::infinity;
 }
 
 template <typename T, size_t N>
@@ -543,7 +543,7 @@ CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x)
 template <typename T, size_t N>
 CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x)
 {
-    return (x & internal::highbitmask<T>) != 0;
+    return (x & constants<T>::highbitmask()) != 0;
 }
 
 template <typename T, size_t N>
diff --git a/include/kfr/base/platform.hpp b/include/kfr/base/platform.hpp
@@ -93,94 +93,81 @@ CMT_UNUSED static const char* cpu_name(cpu_t set)
     return "-";
 }
 
+#ifdef CMT_ARCH_X64
+template <int = 0>
+constexpr inline const char* bitness_const(const char*, const char* x64)
+{
+    return x64;
+}
 template <typename T>
-constexpr inline const T& bitness_const(const T& x32, const T& x64)
+constexpr inline const T& bitness_const(const T&, const T& x64)
 {
-#ifdef CMT_ARCH_X64
-    (void)x32;
     return x64;
+}
 #else
-    (void)x64;
+template <int           = 0>
+constexpr inline const char* bitness_const(const char* x32, const char*)
+{
+    return x32;
+}
+template <typename T>
+constexpr inline const T& bitness_const(const T& x32, const T&)
+{
     return x32;
-#endif
 }
-
-#ifdef CMT_ARCH_X64
-constexpr inline const char* bitness_const(const char*, const char* x64) { return x64; }
-#else
-constexpr inline const char* bitness_const(const char* x32, const char*) { return x32; }
 #endif
 
-constexpr size_t native_cache_alignment        = 64;
-constexpr size_t native_cache_alignment_mask   = native_cache_alignment - 1;
-constexpr size_t maximum_vector_alignment      = 32;
-constexpr size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1;
-constexpr size_t native_register_count         = bitness_const(8, 16);
+template <typename T = i32, cpu_t c = cpu_t::native>
+struct platform
+{
+    constexpr static size_t native_cache_alignment        = 64;
+    constexpr static size_t native_cache_alignment_mask   = native_cache_alignment - 1;
+    constexpr static size_t maximum_vector_alignment      = 32;
+    constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1;
+#ifdef CMT_ARCH_X86
+    constexpr static size_t simd_register_count = bitness_const(8, 16);
+#endif
+#ifdef CMT_ARCH_ARM
+    constexpr static size_t simd_register_count = 16;
+#endif
 
-constexpr size_t common_float_vector_size = 16;
-constexpr size_t common_int_vector_size   = 16;
+    constexpr static size_t common_float_vector_size = 16;
+    constexpr static size_t common_int_vector_size   = 16;
 
-template <cpu_t c>
-constexpr size_t native_float_vector_size =
 #ifdef CMT_ARCH_X86
-    c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size;
+    constexpr static size_t native_float_vector_size =
+        c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size;
 #endif
 #ifdef CMT_ARCH_ARM
-c == cpu_t::neon ? 16 : common_float_vector_size;
+    constexpr static size_t native_float_vector_size = c == cpu_t::neon ? 16 : common_float_vector_size;
 #endif
-template <cpu_t c>
-constexpr size_t native_int_vector_size =
 #ifdef CMT_ARCH_X86
-    c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size;
+    constexpr static size_t native_int_vector_size =
+        c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size;
 #endif
 #ifdef CMT_ARCH_ARM
-c == cpu_t::neon ? 16 : common_int_vector_size;
+    constexpr static size_t native_int_vector_size = c == cpu_t::neon ? 16 : common_int_vector_size;
 #endif
 
-/// @brief SIMD vector width for the given cpu instruction set
-template <typename T, cpu_t c = cpu_t::native>
-constexpr size_t vector_width = const_max(size_t(1), typeclass<T> == datatype::f
-                                                         ? native_float_vector_size<c> / sizeof(T)
-                                                         : native_int_vector_size<c> / sizeof(T));
+    /// @brief SIMD vector width for the given cpu instruction set
+    constexpr static size_t vector_width =
+        (const_max(size_t(1), typeclass<T> == datatype::f ? native_float_vector_size / sizeof(T)
+                                                          : native_int_vector_size / sizeof(T)));
 
-template <cpu_t c>
-constexpr size_t vector_width<void, c> = 0;
+    constexpr static size_t vector_capacity = simd_register_count * vector_width;
 
-namespace internal
-{
+    constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity / 4);
 
-template <cpu_t c>
-constexpr size_t native_vector_alignment = const_max(native_float_vector_size<c>, native_int_vector_size<c>);
+    constexpr static size_t native_vector_alignment =
+        const_max(native_float_vector_size, native_int_vector_size);
 
-template <cpu_t c>
-constexpr bool fast_unaligned =
+    constexpr static bool fast_unaligned =
 #ifdef CMT_ARCH_X86
-    c >= cpu_t::avx1;
+        c >= cpu_t::avx1;
 #else
-    false;
+        false;
 #endif
 
-template <cpu_t c>
-constexpr size_t native_vector_alignment_mask = native_vector_alignment<c> - 1;
-
-template <typename T, cpu_t c>
-constexpr inline size_t get_vector_width(size_t scale = 1)
-{
-    return scale * vector_width<T, c>;
-}
-template <typename T, cpu_t c>
-constexpr inline size_t get_vector_width(size_t x32scale, size_t x64scale)
-{
-    return bitness_const(x32scale, x64scale) * vector_width<T, c>;
-}
-
-template <typename T, cpu_t c>
-constexpr auto vector_width_range = csize<1> << csizeseq<ilog2(vector_width<T, c>) + 1>;
-
-template <typename T, cpu_t c>
-constexpr size_t vector_capacity = native_register_count* vector_width<T, c>;
-
-template <typename T, cpu_t c>
-constexpr size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity<T, c> / 4);
-}
+    constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
+};
 }
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -34,8 +34,10 @@ namespace kfr
 
 constexpr size_t maximum_expression_width = bitness_const(16, 32);
 
+constexpr size_t expression_vtable_size = 2 + ilog2(maximum_expression_width) + 1;
+
 template <typename T>
-using expression_vtable = std::array<void*, 2 + ilog2(maximum_expression_width) + 1>;
+using expression_vtable = std::array<void*, expression_vtable_size>;
 
 struct dummy_content
 {
@@ -172,12 +174,11 @@ template <typename T, typename E>
 expression_vtable<T> make_expression_vtable_impl()
 {
     expression_vtable<T> result;
-    constexpr size_t size = result.size() - 2;
 
     result[0] = reinterpret_cast<void*>(internal::make_expression_begin_block<decay<E>>());
     result[1] = reinterpret_cast<void*>(internal::make_expression_end_block<decay<E>>());
 
-    cforeach(csizeseq<size>, [&](auto u) {
+    cforeach(csizeseq_t<expression_vtable_size - 2>(), [&](auto u) {
         constexpr size_t N = 1 << val_of(decltype(u)());
         result[2 + val_of(decltype(u)())] =
             reinterpret_cast<void*>(internal::make_expression_func<T, N, decay<E>>());
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -40,27 +40,33 @@ struct seed_from_rdtsc_t
 
 constexpr seed_from_rdtsc_t seed_from_rdtsc{};
 
+#ifdef CMT_COMPILER_CLANG
+#define KFR_builtin_readcyclecounter() __builtin_readcyclecounter()
+#else
+#define KFR_builtin_readcyclecounter() __rdtsc()
+#endif
+
 struct random_bit_generator
 {
     random_bit_generator(seed_from_rdtsc_t) noexcept
-        : state(bitcast<u32>(make_vector(__builtin_readcyclecounter(),
-                                         (__builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
+        : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
+                                         (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
     {
         (void)operator()();
     }
-    constexpr random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3)
+    random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3)
     {
         (void)operator()();
     }
-    constexpr random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1)))
+    random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1)))
     {
         (void)operator()();
     }
 
     inline random_state operator()()
     {
-        constexpr static random_state mul{ 214013u, 17405u, 214013u, 69069u };
-        constexpr static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
+        CMT_GNU_CONSTEXPR static random_state mul{ 214013u, 17405u, 214013u, 69069u };
+        CMT_GNU_CONSTEXPR static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
         state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
         return state;
     }
diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp
@@ -85,19 +85,19 @@ CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<I
 template <typename T, size_t N>
 CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices)
 {
-    return internal::gather(base, indices, csizeseq<N>);
+    return internal::gather(base, indices, csizeseq_t<N>());
 }
 
 template <size_t Nout, typename T>
 CMT_INLINE vec<T, Nout> gather_stride(const T* base, size_t stride)
 {
-    return internal::gather_stride_s<Nout>(base, stride, csizeseq<Nout>);
+    return internal::gather_stride_s<Nout>(base, stride, csizeseq_t<Nout>());
 }
 
 template <size_t Nout, size_t Stride, typename T>
 CMT_INLINE vec<T, Nout> gather_stride(const T* base)
 {
-    return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>);
+    return internal::gather_stride<Nout, Stride>(base, csizeseq_t<Nout>());
 }
 
 template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices>
@@ -108,7 +108,7 @@ CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& 
 template <size_t groupsize = 1, typename T, size_t N, typename IT>
 CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset)
 {
-    return gather_helper<groupsize>(base, offset, csizeseq<N>);
+    return gather_helper<groupsize>(base, offset, csizeseq_t<N>());
 }
 
 template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices>
@@ -121,42 +121,42 @@ CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, N
 template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
 CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value)
 {
-    return scatter_helper<groupsize>(base, offset, value, csizeseq<N>);
+    return scatter_helper<groupsize>(base, offset, value, csizeseq_t<N>());
 }
 
 template <typename T>
-constexpr T partial_masks[] = { internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
-                                internal::allones<T>,
+constexpr T partial_masks[] = { constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
+                                constants<T>::allones(),
                                 T(),
                                 T(),
                                 T(),
diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp
@@ -61,10 +61,10 @@ CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value)
     return finalfn(value);
 }
 
-template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, cpu_t cpu = cpu_t::native>
+template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, KFR_ARCH_DEP>
 struct expression_reduce : output_expression
 {
-    constexpr static size_t width = vector_width<T, cpu> * bitness_const(1, 2);
+    constexpr static size_t width = platform<T>::vector_width * bitness_const(1, 2);
 
     using value_type = T;
 
@@ -85,16 +85,16 @@ struct expression_reduce : output_expression
 
 protected:
     void reset() { counter = 0; }
-    CMT_INLINE void process(vec<T, width> x) const { value = reducefn(transformfn(x), value); }
+    CMT_INLINE void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); }
 
     template <size_t N, KFR_ENABLE_IF(N < width)>
-    CMT_INLINE void process(vec<T, N> x) const
+    CMT_INLINE void process(const vec<T, N>& x) const
     {
         value = combine(value, reducefn(transformfn(x), narrow<N>(value)));
     }
 
     template <size_t N, KFR_ENABLE_IF(N > width)>
-    CMT_INLINE void process(vec<T, N> x) const
+    CMT_INLINE void process(const vec<T, N>& x) const
     {
         process(low(x));
         process(high(x));
@@ -110,14 +110,14 @@ protected:
 
 template <typename ReduceFn, typename TransformFn = fn::pass_through, typename FinalFn = fn::pass_through,
           typename E1, typename T = value_type_of<E1>>
-KFR_SINTRIN T reduce(E1&& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(),
+KFR_SINTRIN T reduce(const E1& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(),
                      FinalFn&& finalfn = fn::pass_through())
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
     using reducer_t = internal::expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>;
     reducer_t red(std::forward<ReduceFn>(reducefn), std::forward<TransformFn>(transformfn),
                   std::forward<FinalFn>(finalfn));
-    process(red, std::forward<E1>(e1));
+    process(red, e1);
 
     return red.get();
 }
@@ -133,10 +133,10 @@ KFR_FN(reduce)
  * \f]
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T sum(E1&& x)
+KFR_SINTRIN T sum(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::add());
+    return reduce(x, fn::add());
 }
 
 /**
@@ -148,10 +148,10 @@ KFR_SINTRIN T sum(E1&& x)
  * \f]
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T mean(E1&& x)
+KFR_SINTRIN T mean(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::add(), fn::pass_through(), fn::final_mean());
+    return reduce(x, fn::add(), fn::pass_through(), fn::final_mean());
 }
 
 /**
@@ -160,10 +160,10 @@ KFR_SINTRIN T mean(E1&& x)
  * x must have its size and type specified.
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T minof(E1&& x)
+KFR_SINTRIN T minof(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::min());
+    return reduce(x, fn::min());
 }
 
 /**
@@ -172,10 +172,10 @@ KFR_SINTRIN T minof(E1&& x)
  * x must have its size and type specified.
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T maxof(E1&& x)
+KFR_SINTRIN T maxof(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::max());
+    return reduce(x, fn::max());
 }
 
 /**
@@ -184,10 +184,10 @@ KFR_SINTRIN T maxof(E1&& x)
  * x must have its size and type specified.
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T absminof(E1&& x)
+KFR_SINTRIN T absminof(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::absmin());
+    return reduce(x, fn::absmin());
 }
 
 /**
@@ -196,10 +196,10 @@ KFR_SINTRIN T absminof(E1&& x)
  * x must have its size and type specified.
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T absmaxof(E1&& x)
+KFR_SINTRIN T absmaxof(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::absmax());
+    return reduce(x, fn::absmax());
 }
 
 /**
@@ -229,10 +229,10 @@ KFR_SINTRIN T dotproduct(E1&& x, E2&& y)
    \f]
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T rms(E1&& x)
+KFR_SINTRIN T rms(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::add(), fn::sqr(), fn::final_rootmean());
+    return reduce(x, fn::add(), fn::sqr(), fn::final_rootmean());
 }
 
 /**
@@ -244,10 +244,10 @@ KFR_SINTRIN T rms(E1&& x)
    \f]
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T sumsqr(E1&& x)
+KFR_SINTRIN T sumsqr(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::add(), fn::sqr());
+    return reduce(x, fn::add(), fn::sqr());
 }
 
 /**
@@ -259,9 +259,9 @@ KFR_SINTRIN T sumsqr(E1&& x)
    \f]
  */
 template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T product(E1&& x)
+KFR_SINTRIN T product(const E1& x)
 {
     static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
-    return reduce(std::forward<E1>(x), fn::mul());
+    return reduce(x, fn::mul());
 }
 }
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -121,12 +121,12 @@ KFR_SINTRIN i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
 }
 #endif
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
@@ -178,12 +178,12 @@ KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y
 }
 #endif
 
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
 KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
 }
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
 KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
 {
     return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp
@@ -40,7 +40,7 @@ namespace internal
 template <size_t index, typename T>
 constexpr CMT_INLINE T broadcast_get_nth()
 {
-    return c_qnan<T>;
+    return constants<T>::qnan;
 }
 
 template <size_t index, typename T, typename... Ts>
@@ -51,17 +51,17 @@ constexpr CMT_INLINE T broadcast_get_nth(T x, Ts... rest)
 
 template <typename T, typename... Ts, size_t... indices, size_t Nin = 1 + sizeof...(Ts),
           size_t Nout = sizeof...(indices)>
-CMT_INLINE constexpr vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest)
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest)
 {
-    simd<T, Nout> result{ broadcast_get_nth<indices % Nin>(x, rest...)... };
-    return result;
+    using simd_t = simd<T, Nout>;
+    return KFR_SIMD_SET(simd_t, broadcast_get_nth<indices % Nin>(x, rest...)...);
 }
 }
 
 template <size_t Nout, typename T, typename... Ts>
-constexpr CMT_INLINE vec<T, Nout> broadcast(T x, T y, Ts... rest)
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast(T x, T y, Ts... rest)
 {
-    return internal::broadcast_helper(csizeseq<Nout>, x, y, rest...);
+    return internal::broadcast_helper(csizeseq_t<Nout>(), x, y, rest...);
 }
 KFR_FN(broadcast)
 
@@ -215,10 +215,9 @@ struct shuffle_index_shuffle
     constexpr static size_t indexcount = sizeof...(Indices);
 
     template <size_t index>
-    constexpr inline size_t operator()() const
+    constexpr inline size_t operator()(csize_t<index>) const
     {
-        constexpr int result = csizes_t<Indices...>::get(csize<index % indexcount>);
-        return result + index / indexcount * indexcount;
+        return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount;
     }
 };
 }
@@ -247,11 +246,9 @@ struct shuffle_index_permute
     constexpr static size_t indexcount = sizeof...(Indices);
 
     template <size_t index>
-    constexpr inline size_t operator()() const
+    constexpr inline size_t operator()(csize_t<index>) const
     {
-        constexpr size_t result = csizes_t<Indices...>::get(csize<index % indexcount>);
-        static_assert(result < size, "result < size");
-        return result + index / indexcount * indexcount;
+        return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount;
     }
 };
 }
@@ -341,9 +338,9 @@ struct shuffle_index_blend
     constexpr static size_t indexcount = sizeof...(Indices);
 
     template <size_t index>
-    constexpr inline size_t operator()() const
+    constexpr inline size_t operator()(csize_t<index>) const
     {
-        return (elements_t<Indices...>::get(csize<index % indexcount>) ? size : 0) + index % size;
+        return (elements_t<Indices...>::get(csize_t<index % indexcount>()) ? size : 0) + index % size;
     }
 };
 }
@@ -434,8 +431,7 @@ struct shuffle_index_transpose
 {
     constexpr inline size_t operator()(size_t index) const
     {
-        constexpr size_t side2 = size / side1;
-        return index % side2 * side1 + index / side2;
+        return index % (size / side1) * side1 + index / (size / side1);
     }
 };
 }
diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp
@@ -28,6 +28,9 @@
 #include "kfr.h"
 #include "types.hpp"
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4324))
+
 namespace kfr
 {
 
@@ -72,10 +75,11 @@ CMT_INLINE simd<T, N> simd_read(const T* src)
 template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
 CMT_INLINE simd<T, N> simd_read(const T* src)
 {
-    constexpr size_t first        = prev_poweroftwo(N);
-    constexpr size_t rest         = N - first;
-    constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>);
-    constexpr auto concat_indices = csizeseq<N>;
+    constexpr size_t first = prev_poweroftwo(N);
+    constexpr size_t rest  = N - first;
+    constexpr auto extend_indices =
+        cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>());
+    constexpr auto concat_indices = cvalseq_t<size_t, N>();
     return simd_shuffle<T, first>(simd_read<first, A>(src),
                                   simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices),
                                   concat_indices);
@@ -92,10 +96,11 @@ CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
 {
     constexpr size_t first = prev_poweroftwo(N);
     constexpr size_t rest  = N - first;
-    simd_write<A, first>(dest, simd_shuffle(value, csizeseq<first>));
-    simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq<rest, first>));
+    simd_write<A, first>(dest, simd_shuffle(value, csizeseq_t<first>()));
+    simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq_t<rest, first>()));
 }
 
+#define KFR_SIMD_SET(T, ...) (T{ __VA_ARGS__ })
 #define KFR_SIMD_CAST(T, N, X) __builtin_convertvector(X, ::kfr::simd<T, N>)
 #define KFR_SIMD_BITCAST(T, N, X) ((::kfr::simd<T, N>)(X))
 #define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X))
@@ -107,6 +112,9 @@ namespace internal
 {
 
 template <typename T>
+constexpr inline T maskbits(bool value);
+
+template <typename T>
 struct simd_float_ops
 {
     constexpr static T neg(T x) { return -x; }
@@ -144,74 +152,74 @@ struct simd_int_ops : simd_float_ops<T>
 }
 
 template <typename T, size_t N>
-struct alignas(next_poweroftwo(N * sizeof(T))) simd
+struct alignas(const_min(size_t(64), next_poweroftwo(N * sizeof(T)))) simd
 {
     using ops =
         conditional<std::is_floating_point<T>::value, internal::simd_float_ops<T>, internal::simd_int_ops<T>>;
-    constexpr static simd broadcast(T value) { return broadcast_impl(value, csizeseq<N>); }
+    constexpr static simd broadcast(T value) { return broadcast_impl(value, cvalseq_t<size_t, N>()); }
     constexpr friend simd operator+(const simd& x) { return x; }
-    constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, csizeseq<N>); }
-    constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, csizeseq<N>); }
+    constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, cvalseq_t<size_t, N>()); }
+    constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, cvalseq_t<size_t, N>()); }
 
     constexpr friend simd operator+(const simd& x, const simd& y)
     {
-        return op_impl<ops::add>(x, y, csizeseq<N>);
+        return op_impl<ops::add>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator-(const simd& x, const simd& y)
     {
-        return op_impl<ops::sub>(x, y, csizeseq<N>);
+        return op_impl<ops::sub>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator*(const simd& x, const simd& y)
     {
-        return op_impl<ops::mul>(x, y, csizeseq<N>);
+        return op_impl<ops::mul>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator/(const simd& x, const simd& y)
     {
-        return op_impl<ops::div>(x, y, csizeseq<N>);
+        return op_impl<ops::div>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator&(const simd& x, const simd& y)
     {
-        return op_impl<ops::band>(x, y, csizeseq<N>);
+        return op_impl<ops::band>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator|(const simd& x, const simd& y)
     {
-        return op_impl<ops::bor>(x, y, csizeseq<N>);
+        return op_impl<ops::bor>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator^(const simd& x, const simd& y)
     {
-        return op_impl<ops::bxor>(x, y, csizeseq<N>);
+        return op_impl<ops::bxor>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator<<(const simd& x, const simd& y)
     {
-        return op_impl<ops::shl>(x, y, csizeseq<N>);
+        return op_impl<ops::shl>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator>>(const simd& x, const simd& y)
     {
-        return op_impl<ops::shr>(x, y, csizeseq<N>);
+        return op_impl<ops::shr>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator==(const simd& x, const simd& y)
     {
-        return op_impl<ops::eq>(x, y, csizeseq<N>);
+        return op_impl<ops::eq>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator!=(const simd& x, const simd& y)
     {
-        return op_impl<ops::ne>(x, y, csizeseq<N>);
+        return op_impl<ops::ne>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator<(const simd& x, const simd& y)
     {
-        return op_impl<ops::lt>(x, y, csizeseq<N>);
+        return op_impl<ops::lt>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator>(const simd& x, const simd& y)
     {
-        return op_impl<ops::gt>(x, y, csizeseq<N>);
+        return op_impl<ops::gt>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator<=(const simd& x, const simd& y)
     {
-        return op_impl<ops::le>(x, y, csizeseq<N>);
+        return op_impl<ops::le>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr friend simd operator>=(const simd& x, const simd& y)
     {
-        return op_impl<ops::ge>(x, y, csizeseq<N>);
+        return op_impl<ops::ge>(x, y, cvalseq_t<size_t, N>());
     }
     constexpr T operator[](size_t index) const { return items[index]; }
     T& operator[](size_t index) { return items[index]; }
@@ -220,29 +228,29 @@ struct alignas(next_poweroftwo(N * sizeof(T))) simd
     template <typename U>
     constexpr simd<U, N> cast() const
     {
-        return cast_impl<U>(*this, csizeseq<N>);
+        return cast_impl<U>(*this, cvalseq_t<size_t, N>());
     }
 
 private:
     template <typename U, size_t... indices>
     constexpr static simd<U, N> cast_impl(const simd& x, csizes_t<indices...>)
     {
-        return simd<U, N>{ static_cast<U>(x.items[indices])... };
+        return simd<U, N>{ { static_cast<U>(x.items[indices])... } };
     }
     template <T (*fn)(T), size_t... indices>
     constexpr static simd op_impl(const simd& x, csizes_t<indices...>)
     {
-        return simd{ fn(x.items[indices])... };
+        return simd{ { fn(x.items[indices])... } };
     }
     template <T (*fn)(T, T), size_t... indices>
     constexpr static simd op_impl(const simd& x, const simd& y, csizes_t<indices...>)
     {
-        return simd{ fn(x.items[indices], y.items[indices])... };
+        return simd{ { fn(x.items[indices], y.items[indices])... } };
     }
     template <size_t... indices>
     constexpr static simd broadcast_impl(T value, csizes_t<indices...>)
     {
-        return simd{ ((void)indices, value)... };
+        return simd{ { ((void)indices, value)... } };
     }
 };
 
@@ -256,12 +264,12 @@ template <typename T, size_t N, int... indices>
 constexpr CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const simd<T, N>& x, const simd<T, N>& y,
                                                               cints_t<indices...>) noexcept
 {
-    return simd<T, sizeof...(indices)>{ (indices == -1 ? T()
-                                                       : ((indices >= N) ? y[indices - N] : x[indices]))... };
+    return simd<T, sizeof...(indices)>{ { (
+        indices == -1 ? T() : ((indices >= N) ? y[indices - N] : x[indices]))... } };
 }
 
 template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
-constexpr CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept
+CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept
 {
     union {
         const simd<From, N> from;
@@ -297,19 +305,22 @@ CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, ctrue_t)
 template <size_t N, bool A = false, typename T>
 CMT_INLINE simd<T, N> simd_read(const T* src)
 {
-    return simd_read_impl<N>(src, cbool<A>);
+    return simd_read_impl<N>(src, cbool_t<A>());
 }
 
 template <bool A = false, size_t N, typename T>
 CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
 {
-    return simd_write_impl<N>(dest, value, cbool<A>);
+    return simd_write_impl<N>(dest, value, cbool_t<A>());
 }
 
-#define KFR_SIMD_CAST(T, N, X) (::kfr::simd_cast<T>(X))
-#define KFR_SIMD_BITCAST(T, N, X) (::kfr::simd_bitcast<T>(X))
+#define KFR_SIMD_SET(T, ...) (T{ { __VA_ARGS__ } })
+#define KFR_SIMD_CAST(T, N, X) ((void)N, ::kfr::simd_cast<T>(X))
+#define KFR_SIMD_BITCAST(T, N, X) ((void)N, ::kfr::simd_bitcast<T>(X))
 #define KFR_SIMD_BROADCAST(T, N, X) (::kfr::simd<T, N>::broadcast(X))
-#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints<__VA_ARGS__>)
+#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints_t<__VA_ARGS__>())
 
 #endif
 }
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -35,7 +35,7 @@
 #include "shuffle.hpp"
 
 #if CMT_HAS_WARNING("-Wc99-extensions")
-#pragma clang diagnostic ignored "-Wc99-extensions"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
 #endif
 
 namespace kfr
@@ -44,16 +44,6 @@ namespace kfr
 namespace intrinsics
 {
 
-template <typename T>
-constexpr static T fold_constant_div = choose_const<T>(0x1.921fb6p-1f, 0x1.921fb54442d18p-1);
-
-template <typename T>
-constexpr static T fold_constant_hi = choose_const<T>(0x1.922000p-1f, 0x1.921fb40000000p-1);
-template <typename T>
-constexpr static T fold_constant_rem1 = choose_const<T>(-0x1.2ae000p-19f, 0x1.4442d00000000p-25);
-template <typename T>
-constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49);
-
 template <typename T, size_t N>
 KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0)
 {
@@ -70,9 +60,9 @@ KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, con
 template <typename T, size_t N, typename Tprecise = f64>
 KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
 {
-    const vec<T, N> xabs    = abs(x);
-    constexpr vec<T, N> div = fold_constant_div<T>;
-    vec<T, N> y             = floor(xabs / div);
+    const vec<T, N> xabs = abs(x);
+    constexpr T div = constants<T>::fold_constant_div;
+    vec<T, N> y = floor(xabs / div);
     quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
 
     const mask<T, N> msk = bitcast<T>((quadrant & 1) != 0);
@@ -80,25 +70,25 @@ KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
     y        = select(msk, y + T(1.0), y);
     quadrant = quadrant & 7;
 
-    constexpr vec<Tprecise, N> hi = cast<Tprecise>(fold_constant_hi<T>);
-    constexpr vec<T, N> rem1      = fold_constant_rem1<T>;
-    constexpr vec<T, N> rem2      = fold_constant_rem2<T>;
+    constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi);
+    constexpr T rem1      = constants<T>::fold_constant_rem1;
+    constexpr T rem2      = constants<T>::fold_constant_rem2;
     return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2;
 }
 
 template <size_t N>
 KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask)
 {
-    constexpr f32 sin_c2  = -0x2.aaaaacp-4f;
-    constexpr f32 sin_c4  = 0x2.222334p-8f;
-    constexpr f32 sin_c6  = -0xd.0566ep-16f;
-    constexpr f32 sin_c8  = 0x3.64cc1cp-20f;
-    constexpr f32 sin_c10 = -0x5.6c4a4p-24f;
-    constexpr f32 cos_c2  = -0x8.p-4f;
-    constexpr f32 cos_c4  = 0xa.aaaabp-8f;
-    constexpr f32 cos_c6  = -0x5.b05d48p-12f;
-    constexpr f32 cos_c8  = 0x1.a065f8p-16f;
-    constexpr f32 cos_c10 = -0x4.cd156p-24f;
+    constexpr f32 sin_c2  = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f);
+    constexpr f32 sin_c4  = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f);
+    constexpr f32 sin_c6  = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f);
+    constexpr f32 sin_c8  = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f);
+    constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f);
+    constexpr f32 cos_c2  = CMT_FP(-0x8.p-4f, -5.0000000000e-01f);
+    constexpr f32 cos_c4  = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f);
+    constexpr f32 cos_c6  = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f);
+    constexpr f32 cos_c8  = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f);
+    constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f);
 
     const vec<f32, N> x2 = folded * folded;
 
@@ -112,22 +102,22 @@ KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N
 template <size_t N>
 KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask)
 {
-    constexpr f64 sin_c2  = -0x2.aaaaaaaaaaaaap-4;
-    constexpr f64 sin_c4  = 0x2.22222222220cep-8;
-    constexpr f64 sin_c6  = -0xd.00d00cffd6618p-16;
-    constexpr f64 sin_c8  = 0x2.e3bc744fb879ep-20;
-    constexpr f64 sin_c10 = -0x6.b99034c1467a4p-28;
-    constexpr f64 sin_c12 = 0xb.0711ea8fe8ee8p-36;
-    constexpr f64 sin_c14 = -0xb.7e010897e55dp-44;
-    constexpr f64 sin_c16 = -0xb.64eac07f1d6bp-48;
-    constexpr f64 cos_c2  = -0x8.p-4;
-    constexpr f64 cos_c4  = 0xa.aaaaaaaaaaaa8p-8;
-    constexpr f64 cos_c6  = -0x5.b05b05b05ad28p-12;
-    constexpr f64 cos_c8  = 0x1.a01a01a0022e6p-16;
-    constexpr f64 cos_c10 = -0x4.9f93ed845de2cp-24;
-    constexpr f64 cos_c12 = 0x8.f76bc015abe48p-32;
-    constexpr f64 cos_c14 = -0xc.9bf2dbe00379p-40;
-    constexpr f64 cos_c16 = 0xd.1232ac32f7258p-48;
+    constexpr f64 sin_c2  = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01);
+    constexpr f64 sin_c4  = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03);
+    constexpr f64 sin_c6  = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04);
+    constexpr f64 sin_c8  = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06);
+    constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08);
+    constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10);
+    constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13);
+    constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14);
+    constexpr f64 cos_c2  = CMT_FP(-0x8.p-4, -5.000000000000000000e-01);
+    constexpr f64 cos_c4  = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02);
+    constexpr f64 cos_c6  = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03);
+    constexpr f64 cos_c8  = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05);
+    constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07);
+    constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09);
+    constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11);
+    constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14);
 
     vec<f64, N> x2 = folded * folded;
     vec<f64, N> formula =
@@ -193,7 +183,7 @@ KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x)
 template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
 KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
 {
-    constexpr vec<T, N> msk = broadcast<N>(internal::highbitmask<T>);
+    CMT_GNU_CONSTEXPR vec<T, N> msk = broadcast<N>(constants<T>::highbitmask());
 
     constexpr static T c2 = -0.16665853559970855712890625;
     constexpr static T c4 = +8.31427983939647674560546875e-3;
@@ -238,7 +228,7 @@ KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x)
 template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
 KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x)
 {
-    return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x);
+    return select(abs(x) <= constants<T>::epsilon, T(1), sin(x) / x);
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
@@ -294,37 +284,37 @@ KFR_I_FLT_CONVERTER(sinc)
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout sindeg(const T& x)
 {
-    return sin(x * c_degtorad<Tout>);
+    return sin(x * constants<Tout>::degtorad);
 }
 
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout cosdeg(const T& x)
 {
-    return cos(x * c_degtorad<Tout>);
+    return cos(x * constants<Tout>::degtorad);
 }
 
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout fastsindeg(const T& x)
 {
-    return fastsin(x * c_degtorad<Tout>);
+    return fastsin(x * constants<Tout>::degtorad);
 }
 
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout fastcosdeg(const T& x)
 {
-    return fastcos(x * c_degtorad<Tout>);
+    return fastcos(x * constants<Tout>::degtorad);
 }
 
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout sincosdeg(const T& x)
 {
-    return sincos(x * c_degtorad<Tout>);
+    return sincos(x * constants<Tout>::degtorad);
 }
 
 template <typename T, typename Tout = flt_type<T>>
 KFR_SINTRIN Tout cossindeg(const T& x)
 {
-    return cossin(x * c_degtorad<Tout>);
+    return cossin(x * constants<Tout>::degtorad);
 }
 }
 
diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp
@@ -45,7 +45,7 @@ CMT_INLINE vec<T, N> sort(const vec<T, N>& x)
     constexpr size_t Nhalf = N / 2;
     vec<T, Nhalf> e = low(x);
     vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
+    constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>());
     for (size_t i = 0; i < Nhalf; i++)
     {
         vec<T, Nhalf> t;
@@ -78,7 +78,7 @@ CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x)
     constexpr size_t Nhalf = N / 2;
     vec<T, Nhalf> e = low(x);
     vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
+    constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>());
     for (size_t i = 0; i < Nhalf; i++)
     {
         vec<T, Nhalf> t;
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -64,19 +64,19 @@ KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
     mask<f32, N> inverse;
     const vec<f32, N> x = trig_fold_simple(x_full, inverse);
 
-    constexpr f32 tan_c2  = 0x5.555378p-4;
-    constexpr f32 tan_c4  = 0x2.225bb8p-4;
-    constexpr f32 tan_c6  = 0xd.ac3fep-8;
-    constexpr f32 tan_c8  = 0x6.41644p-8;
-    constexpr f32 tan_c10 = 0xc.bfe7ep-12;
-    constexpr f32 tan_c12 = 0x2.6754dp-8;
-
-    constexpr f32 cot_c2  = -0x5.555558p-4;
-    constexpr f32 cot_c4  = -0x5.b0581p-8;
-    constexpr f32 cot_c6  = -0x8.ac5ccp-12;
-    constexpr f32 cot_c8  = -0xd.aaa01p-16;
-    constexpr f32 cot_c10 = -0x1.a9a9b4p-16;
-    constexpr f32 cot_c12 = -0x6.f7d4dp-24;
+    constexpr f32 tan_c2  = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01);
+    constexpr f32 tan_c4  = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01);
+    constexpr f32 tan_c6  = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02);
+    constexpr f32 tan_c8  = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02);
+    constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03);
+    constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03);
+
+    constexpr f32 cot_c2  = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01);
+    constexpr f32 cot_c4  = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02);
+    constexpr f32 cot_c6  = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03);
+    constexpr f32 cot_c8  = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04);
+    constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05);
+    constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07);
 
     const vec<f32, N> x2  = x * x;
     const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6,
@@ -92,27 +92,27 @@ KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
     mask<f64, N> inverse;
     const vec<f64, N> x = trig_fold_simple(x_full, inverse);
 
-    constexpr f64 tan_c2  = 0x5.5555554d8e5b8p-4;
-    constexpr f64 tan_c4  = 0x2.222224820264p-4;
-    constexpr f64 tan_c6  = 0xd.d0d90de32b3e8p-8;
-    constexpr f64 tan_c8  = 0x5.99723bdcf5cacp-8;
-    constexpr f64 tan_c10 = 0x2.434a142e413ap-8;
-    constexpr f64 tan_c12 = 0xf.2b59061305efp-12;
-    constexpr f64 tan_c14 = 0x4.a12565071a664p-12;
-    constexpr f64 tan_c16 = 0x4.dada3797ac1bcp-12;
-    constexpr f64 tan_c18 = -0x1.a74976b6ea3f3p-12;
-    constexpr f64 tan_c20 = 0x1.d06a5ae5e4a74p-12;
-
-    constexpr f64 cot_c2  = -0x5.5555555555554p-4;
-    constexpr f64 cot_c4  = -0x5.b05b05b05b758p-8;
-    constexpr f64 cot_c6  = -0x8.ab355dffc79a8p-12;
-    constexpr f64 cot_c8  = -0xd.debbca405c9f8p-16;
-    constexpr f64 cot_c10 = -0x1.66a8edb99b15p-16;
-    constexpr f64 cot_c12 = -0x2.450239be0ee92p-20;
-    constexpr f64 cot_c14 = -0x3.ad6ddb4719438p-24;
-    constexpr f64 cot_c16 = -0x5.ff4c42741356p-28;
-    constexpr f64 cot_c18 = -0x9.06881bcdf3108p-32;
-    constexpr f64 cot_c20 = -0x1.644abedc113cap-32;
+    constexpr f64 tan_c2  = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01);
+    constexpr f64 tan_c4  = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01);
+    constexpr f64 tan_c6  = CMT_FP(0xd.d0d90de32b3e8p-8, 5.396801556632355862e-02);
+    constexpr f64 tan_c8  = CMT_FP(0x5.99723bdcf5cacp-8, 2.187265359403693307e-02);
+    constexpr f64 tan_c10 = CMT_FP(0x2.434a142e413ap-8, 8.839254309582239566e-03);
+    constexpr f64 tan_c12 = CMT_FP(0xf.2b59061305efp-12, 3.703449009834865711e-03);
+    constexpr f64 tan_c14 = CMT_FP(0x4.a12565071a664p-12, 1.130243370829653185e-03);
+    constexpr f64 tan_c16 = CMT_FP(0x4.dada3797ac1bcp-12, 1.185276423238536747e-03);
+    constexpr f64 tan_c18 = CMT_FP(-0x1.a74976b6ea3f3p-12, -4.036779095551438937e-04);
+    constexpr f64 tan_c20 = CMT_FP(0x1.d06a5ae5e4a74p-12, 4.429010863244216712e-04);
+
+    constexpr f64 cot_c2  = CMT_FP(-0x5.5555555555554p-4, -3.333333333333333148e-01);
+    constexpr f64 cot_c4  = CMT_FP(-0x5.b05b05b05b758p-8, -2.222222222222377391e-02);
+    constexpr f64 cot_c6  = CMT_FP(-0x8.ab355dffc79a8p-12, -2.116402116358796163e-03);
+    constexpr f64 cot_c8  = CMT_FP(-0xd.debbca405c9f8p-16, -2.116402122295888289e-04);
+    constexpr f64 cot_c10 = CMT_FP(-0x1.66a8edb99b15p-16, -2.137779458737224013e-05);
+    constexpr f64 cot_c12 = CMT_FP(-0x2.450239be0ee92p-20, -2.164426049513111728e-06);
+    constexpr f64 cot_c14 = CMT_FP(-0x3.ad6ddb4719438p-24, -2.191935496317727080e-07);
+    constexpr f64 cot_c16 = CMT_FP(-0x5.ff4c42741356p-28, -2.234152473099993830e-08);
+    constexpr f64 cot_c18 = CMT_FP(-0x9.06881bcdf3108p-32, -2.101416316020595077e-09);
+    constexpr f64 cot_c20 = CMT_FP(-0x1.644abedc113cap-32, -3.240456633529511097e-10);
 
     const vec<f64, N> x2  = x * x;
     const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6,
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -30,8 +30,8 @@
 
 #include <cmath>
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 
 #include "../cometa.hpp"
 
@@ -210,11 +210,14 @@ using returns = cometa::fn_returns<T>;
 }
 
 template <typename T>
-using ftype = deep_rebind<T, float_type<typebits<deep_subtype<T>>::bits>>;
+using ftype =
+    typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>;
 template <typename T>
-using itype = deep_rebind<T, int_type<typebits<deep_subtype<T>>::bits>>;
+using itype =
+    typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>;
 template <typename T>
-using utype = deep_rebind<T, unsigned_type<typebits<deep_subtype<T>>::bits>>;
+using utype =
+    typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>;
 
 template <typename T>
 using fsubtype = ftype<subtype<T>>;
@@ -301,8 +304,8 @@ CMT_INLINE void zeroize(T1& value)
     builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1));
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wattributes"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes")
 
 template <typename T, bool A>
 struct struct_with_alignment
@@ -322,7 +325,7 @@ __attribute__((__packed__, __may_alias__)) //
 #endif
 ;
 
-#pragma GCC diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 }
 
 template <typename T>
@@ -330,45 +333,6 @@ struct initialvalue
 {
 };
 
-#if CMT_COMPILER_GNU
-constexpr double infinity = __builtin_inf();
-constexpr double qnan     = __builtin_nan("");
-#else
-constexpr double infinity = HUGE_VAL;
-constexpr double qnan     = NAN;
-#endif
-
-namespace internal
-{
-#if CMT_COMPILER_GNU
-constexpr f32 allones_f32 = -__builtin_nanf("0xFFFFFFFF");
-constexpr f64 allones_f64 = -__builtin_nan("0xFFFFFFFFFFFFFFFF");
-#else
-constexpr f32 allones_f32 = -NAN;
-constexpr f64 allones_f64 = -NAN;
-#endif
-
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub allones = choose_const<Tsub>(allones_f32, allones_f64, static_cast<Tsub>(-1));
-
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub allzeros = Tsub();
-
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub highbitmask = choose_const<Tsub>(-0.f, -0.0, 1ull << (typebits<T>::bits - 1));
-
-template <typename T, typename Tsub = subtype<T>>
-constexpr Tsub invhighbitmask = choose_const<Tsub>(__builtin_nanf("0xFFFFFFFF"),
-                                                   __builtin_nan("0xFFFFFFFFFFFFFFFF"),
-                                                   ~(1ull << (typebits<T>::bits - 1)));
-
-template <typename T>
-constexpr inline T maskbits(bool value)
-{
-    return value ? internal::allones<T> : T();
-}
-}
-
 namespace internal
 {
 template <size_t width, typename Fn>
@@ -376,7 +340,7 @@ CMT_INLINE void block_process_impl(size_t& i, size_t size, Fn&& fn)
 {
     CMT_LOOP_NOUNROLL
     for (; i < size / width * width; i += width)
-        fn(i, csize<width>);
+        fn(i, csize_t<width>());
 }
 }
 
@@ -388,4 +352,4 @@ CMT_INLINE void block_process(size_t size, csizes_t<widths...>, Fn&& fn)
 }
 }
 
-#pragma GCC diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -32,6 +32,9 @@
 #include "read_write.hpp"
 #include "types.hpp"
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4324))
+
 namespace kfr
 {
 
@@ -136,7 +139,7 @@ struct univector_base : input_expression, output_expression
     {
         ringbuf_write(cursor, x.data(), N);
     }
-    void ringbuf_write(size_t& cursor, const T value)
+    void ringbuf_write(size_t& cursor, const T& value)
     {
         T* data      = get_data();
         data[cursor] = value;
@@ -205,8 +208,8 @@ private:
 };
 
 template <typename T, size_t Size>
-struct alignas(maximum_vector_alignment) univector : std::array<T, Size>,
-                                                     univector_base<T, univector<T, Size>>
+struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, Size>,
+                                                                 univector_base<T, univector<T, Size>>
 {
     using std::array<T, Size>::size;
     using size_type = size_t;
@@ -216,7 +219,8 @@ struct alignas(maximum_vector_alignment) univector : std::array<T, Size>,
         this->assign_expr(std::forward<Input>(input));
     }
     template <typename... Args>
-    constexpr univector(T x, Args... args) noexcept : std::array<T, Size>{ { x, static_cast<T>(args)... } }
+    constexpr univector(const T& x, const Args&... args) noexcept
+        : std::array<T, Size>{ { x, static_cast<T>(args)... } }
     {
     }
 
@@ -344,3 +348,5 @@ CMT_INLINE univector<T> render(Expr&& expr, size_t size)
     return result;
 }
 }
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -27,15 +27,15 @@
 
 #include "kfr.h"
 
-#include "types.hpp"
-
+#include "constants.hpp"
 #include "simd.hpp"
+#include "types.hpp"
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wfloat-equal"
-#pragma clang diagnostic ignored "-Wc++98-compat-local-type-template-args"
-#pragma clang diagnostic ignored "-Wshadow"
-#pragma clang diagnostic ignored "-Wpacked"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked")
 
 namespace kfr
 {
@@ -139,6 +139,12 @@ struct mask;
 namespace internal
 {
 
+template <typename T>
+constexpr inline T maskbits(bool value)
+{
+    return value ? constants<T>::allones() : T();
+}
+
 template <typename T, size_t N>
 struct flt_type_impl<vec<T, N>>
 {
@@ -175,38 +181,36 @@ constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept
 namespace internal
 {
 template <typename Fn, size_t index>
-constexpr enable_if<std::is_same<size_t, decltype(std::declval<Fn>().operator()(size_t()))>::value, size_t>
-get_vec_index()
+constexpr size_t get_vec_index() noexcept
 {
+#ifdef CMT_COMPILER_GNU
     constexpr Fn fn{};
-    return fn(index);
+    return fn(csize_t<index>());
+#else
+    return Fn()(csize_t<index>());
+#endif
 }
 
-template <typename Fn, size_t index>
-constexpr enable_if<
-    std::is_same<size_t, decltype(std::declval<Fn>().template operator() < index > ())>::value, size_t>
-get_vec_index(int = 0)
+constexpr int vindex(size_t index) { return index == index_undefined ? -1 : static_cast<int>(index); }
+
+template <typename T, size_t N, size_t... Indices, typename = enable_if<!(is_compound<T>::value)>,
+          typename = void>
+CMT_INLINE vec<T, sizeof...(Indices)> shufflevector_i_t(csizes_t<Indices...>, const vec<T, N>& x,
+                                                        const vec<T, N>& y)
 {
-    constexpr Fn fn{};
-    return fn.template operator()<index>();
+    vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE(*x, *y, vindex(Indices)...);
+    return result;
 }
 
-template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(!is_compound<T>::value)>
-CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x,
-                                                    const vec<T, N>& y)
+constexpr size_t vinflate_index(size_t counter, size_t groupsize, size_t index)
 {
-    vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE(
-        *x, *y, static_cast<intptr_t>(Indices == index_undefined ? -1 : static_cast<intptr_t>(Indices))...);
-    return result;
+    return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index);
 }
 
 template <size_t counter, size_t groupsize, size_t... indices>
 constexpr size_t inflate_get_index()
 {
-    constexpr csizes_t<indices...> ind{};
-    return (ind.get(csize<counter / groupsize>) == index_undefined
-                ? index_undefined
-                : (counter % groupsize + groupsize * ind.get(csize<counter / groupsize>)));
+    return vinflate_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>()));
 }
 
 template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)>
@@ -215,29 +219,25 @@ constexpr auto inflate_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt)
 {
     return {};
 }
-}
-
-namespace internal
-{
 
 template <size_t groupsize, size_t... indices>
 constexpr auto inflate(csize_t<groupsize>, csizes_t<indices...>)
 {
-    return inflate_impl(csizes<indices...>, csizeseq<sizeof...(indices)*groupsize>);
+    return inflate_impl(csizes_t<indices...>(), csizeseq_t<sizeof...(indices)*groupsize>());
 }
 
-template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(is_compound<T>::value)>
-CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...> indices, const vec<T, N>& x,
-                                                    const vec<T, N>& y)
+template <typename T, size_t N, size_t... Indices, typename = enable_if<(is_compound<T>::value)>>
+CMT_INLINE vec<T, sizeof...(Indices)> shufflevector_i_t(csizes_t<Indices...> indices, const vec<T, N>& x,
+                                                        const vec<T, N>& y)
 {
-    return compcast<T>(shufflevector(inflate(csize<widthof<T>()>, indices), compcast<subtype<T>>(x),
-                                     compcast<subtype<T>>(y)));
+    return compcast<T>(shufflevector_i_t(inflate(csize_t<widthof<T>()>(), indices), compcast<subtype<T>>(x),
+                                         compcast<subtype<T>>(y)));
 }
 
 template <size_t... Indices, size_t Nout = sizeof...(Indices), typename T, size_t N>
-CMT_INLINE vec<T, Nout> shufflevector(csizes_t<Indices...>, const vec<T, N>& x)
+CMT_INLINE vec<T, Nout> shufflevector_i(csizes_t<Indices...>, const vec<T, N>& x)
 {
-    return internal::shufflevector<T, N>(csizes<Indices...>, x, x);
+    return internal::shufflevector_i_t<T, N>(csizes_t<Indices...>(), x, x);
 }
 
 template <typename Fn, size_t groupsize, typename T, size_t N, size_t... Indices,
@@ -245,23 +245,25 @@ template <typename Fn, size_t groupsize, typename T, size_t N, size_t... Indices
 CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y, cvals_t<size_t, Indices...>)
 {
     static_assert(N % groupsize == 0, "N % groupsize == 0");
-    return internal::shufflevector<T, N>(
-        csizes<(get_vec_index<Fn, Indices / groupsize>() * groupsize + Indices % groupsize)...>, x, y);
+    constexpr csizes_t<(get_vec_index<Fn, Indices / groupsize>() * groupsize + Indices % groupsize)...>
+        vindices{};
+    return internal::shufflevector_i_t<T, N>(vindices, x, y);
 }
 }
 
 template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N>
 CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y)
 {
-    return internal::shufflevector<Fn, groupsize>(x, y, csizeseq<Nout>);
+    return internal::shufflevector<Fn, groupsize>(x, y, cvalseq_t<size_t, Nout>());
 }
 
 template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N>
 CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x)
 {
-    return internal::shufflevector<Fn, groupsize>(x, x, csizeseq<Nout>);
+    return internal::shufflevector<Fn, groupsize>(x, x, cvalseq_t<size_t, Nout>());
 }
 
+#ifdef KFR_ENABLE_SWIZZLE
 namespace swizzle
 {
 template <size_t>
@@ -300,9 +302,10 @@ constexpr swiz<13> s13{};
 constexpr swiz<14> s14{};
 constexpr swiz<15> s15{};
 }
+#endif
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wold-style-cast"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast")
 
 template <size_t N, typename T>
 constexpr CMT_INLINE vec<T, N> broadcast(T x)
@@ -310,7 +313,7 @@ constexpr CMT_INLINE vec<T, N> broadcast(T x)
     return x;
 }
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 namespace internal
 {
@@ -381,14 +384,15 @@ constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& value) noexcept
     return *value;
 }
 
-template <typename To, typename From, typename Tout = deep_rebind<From, To>>
+template <typename To, typename From,
+          typename Tout = typename compound_type_traits<From>::template deep_rebind<To>>
 constexpr CMT_INLINE Tout cast(const From& value) noexcept
 {
     return static_cast<Tout>(value);
 }
 
 template <typename To, typename From>
-constexpr CMT_INLINE To bitcast(const From& value) noexcept
+CMT_GNU_CONSTEXPR CMT_INLINE To bitcast(const From& value) noexcept
 {
     static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types");
     union {
@@ -399,7 +403,7 @@ constexpr CMT_INLINE To bitcast(const From& value) noexcept
 }
 
 template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()>
-constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
+CMT_GNU_CONSTEXPR CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
 {
     using Tsub             = typename vec<To, Nout>::scalar_type;
     constexpr size_t width = vec<To, Nout>::scalar_size();
@@ -407,7 +411,7 @@ constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
 }
 
 template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()>
-constexpr CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept
+CMT_GNU_CONSTEXPR CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept
 {
     using Tsub             = typename mask<To, Nout>::scalar_type;
     constexpr size_t width = mask<To, Nout>::scalar_size();
@@ -456,7 +460,7 @@ constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
 constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
 
 template <typename T, size_t N, size_t... Sizes>
-CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest);
+CMT_INLINE vec<T, csum<size_t, N, Sizes...>()> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest);
 
 namespace internal
 {
@@ -567,7 +571,7 @@ CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, co
     constexpr size_t width = compound_type_traits<T>::width;
     const T list[]         = { args... };
     using simd_t           = typename vec<T, N>::simd_t;
-    return simd_t{ compound_type_traits<T>::at(list[indices / width], indices % width)... };
+    return KFR_SIMD_SET(simd_t, compound_type_traits<T>::at(list[indices / width], indices % width)...);
 }
 }
 
@@ -599,7 +603,7 @@ template <typename Type = void, typename Arg, typename... Args, size_t N = (size
           KFR_ENABLE_IF(is_number<subtype<SubType>>::value)>
 constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest)
 {
-    return internal::make_vector_impl<SubType>(csizeseq<N * widthof<SubType>()>, static_cast<SubType>(x),
+    return internal::make_vector_impl<SubType>(csizeseq_t<N * widthof<SubType>()>(), static_cast<SubType>(x),
                                                static_cast<SubType>(rest)...);
 }
 KFR_FN(pack)
@@ -616,11 +620,19 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
 {
     static_assert(N > 0 && N <= 256, "Invalid vector size");
 
+    static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value ||
+                      std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value ||
+                      std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+                      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+                      std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
+                      std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value ||
+                      !compound_type_traits<T>::is_scalar,
+                  "Invalid vector type");
+
     static_assert(!is_vec<T>::value || is_poweroftwo(size_of<T>()),
                   "Inner vector size must be a power of two");
 
     constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; }
-    using UT          = utype<T>;
     using value_type  = T;
     using scalar_type = subtype<T>;
     using simd_t      = simd<scalar_type, N * compound_type_traits<T>::width>;
@@ -630,34 +642,43 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
     constexpr static bool is_pod = true;
 
     constexpr CMT_INLINE vec() noexcept {}
-    constexpr CMT_INLINE vec(simd_t value) noexcept : v(value) {}
+    constexpr CMT_INLINE vec(const simd_t& value) noexcept : v(value) {}
+    constexpr CMT_INLINE vec(const vec&) noexcept = default;
+    constexpr CMT_INLINE vec(vec&&) noexcept      = default;
+    CMT_INLINE vec& operator=(const vec&) noexcept = default;
+    CMT_INLINE vec& operator=(vec&&) noexcept = default;
+
     template <typename U,
-              KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width > 1)>
+              typename = enable_if<(std::is_convertible<U, T>::value && compound_type_traits<T>::width > 1)>>
     constexpr CMT_INLINE vec(const U& value) noexcept
         : v(*resize<scalar_size()>(bitcast<scalar_type>(make_vector(static_cast<T>(value)))))
     {
     }
     template <typename U,
-              KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width == 1)>
+              typename = enable_if<(std::is_convertible<U, T>::value && compound_type_traits<T>::width == 1)>,
+              typename = void>
     constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_BROADCAST(T, N, static_cast<T>(value)))
     {
     }
+
+    template <typename U,
+              typename = enable_if<std::is_convertible<U, T>::value && !std::is_same<vec<U, N>, T>::value>>
+    CMT_GNU_CONSTEXPR CMT_INLINE vec(const vec<U, N>& value) noexcept
+        : v(*internal::conversion<vec<T, N>, vec<U, N>>::cast(value))
+    {
+    }
     template <typename... Ts>
-    constexpr CMT_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept
-        : v(*make_vector<T>(x, y, rest...))
+    CMT_GNU_CONSTEXPR CMT_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept
+        : v(*make_vector<T>(x, y, static_cast<T>(rest)...))
     {
         static_assert(N <= 2 + sizeof...(Ts), "Too few initializers for vec");
     }
     template <size_t N1, size_t N2, size_t... Ns>
-    constexpr CMT_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2,
-                             const vec<T, Ns>&... vectors) noexcept : v(*concat(v1, v2, vectors...))
+    CMT_GNU_CONSTEXPR CMT_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2,
+                                     const vec<T, Ns>&... vectors) noexcept : v(*concat(v1, v2, vectors...))
     {
-        static_assert(csum(csizes<N1, N2, Ns...>) == N, "Can't concat vectors: invalid csizes");
+        static_assert(csum(csizes_t<N1, N2, Ns...>()) == N, "Can't concat vectors: invalid csizes");
     }
-    constexpr CMT_INLINE vec(const vec&) noexcept = default;
-    constexpr CMT_INLINE vec(vec&&) noexcept      = default;
-    CMT_INLINE vec& operator=(const vec&) noexcept = default;
-    CMT_INLINE vec& operator=(vec&&) noexcept = default;
 
     friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); }
     friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); }
@@ -701,11 +722,11 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
     using array_t = T (&)[N];
     CMT_INLINE array_t arr() { return ref_cast<array_t>(v); }
 
-    template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)>
-    CMT_INLINE constexpr operator vec<U, N>() const noexcept
+    /*template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)>
+    CMT_INLINE operator vec<U, N>() const noexcept
     {
         return internal::conversion<vec<U, N>, vec<T, N>>::cast(*this);
-    }
+    }*/
 
 private:
     struct getter_setter;
@@ -815,11 +836,12 @@ struct mask : public vec<T, N>
     using type                    = T;
     constexpr static size_t width = N;
 
-    using base = vec<T, N>;
+    using simd_t = typename vec<T, N>::simd_t;
+    using base   = vec<T, N>;
 
     constexpr CMT_INLINE mask() noexcept : base() {}
 
-    constexpr CMT_INLINE mask(simd<T, N> value) noexcept : base(value) {}
+    constexpr CMT_INLINE mask(const simd_t& value) noexcept : base(value) {}
     template <size_t N1, size_t... Ns>
     constexpr CMT_INLINE mask(const mask<T, N1>& mask1, const mask<T, Ns>&... masks) noexcept
         : base(*concat(mask1, masks...))
@@ -919,32 +941,34 @@ constexpr mask<T, Nout> partial_mask_helper(csizes_t<indices...>)
 template <typename T, size_t Nout, size_t N1>
 constexpr mask<T, Nout> partial_mask()
 {
-    return internal::partial_mask_helper<T, Nout, N1>(csizeseq<Nout>);
+    return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>());
 }
 
 template <typename T, size_t N>
-CMT_INLINE vec<T, N> concat(const vec<T, N>& x)
+CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x)
 {
     return x;
 }
 
 template <typename T, size_t N1, size_t N2>
-CMT_INLINE vec<T, N1 + N2> concat(const vec<T, N1>& x, const vec<T, N2>& y)
+CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y)
 {
     return concattwo<0, N1 + N2>(x, y);
 }
 
 template <typename T, size_t N1, size_t N2, size_t... Sizes>
-CMT_INLINE auto concat(const vec<T, N1>& x, const vec<T, N2>& y, const vec<T, Sizes>&... args)
+CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y,
+                                                                const vec<T, Sizes>&... args)
 {
-    return concat(x, concat(y, args...));
+    return concat_impl(concat_impl(x, y), args...);
+    //    return concat(x, concat(y, args...));
 }
 }
 
 template <typename T, size_t N, size_t... Sizes>
-CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest)
+CMT_INLINE vec<T, csum<size_t, N, Sizes...>()> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest)
 {
-    return internal::concat(x, rest...);
+    return internal::concat_impl(x, rest...);
 }
 KFR_FN(concat)
 
@@ -1239,13 +1263,14 @@ template <typename T, size_t N, typename Fn, typename... Args,
           typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>>
 constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args)
 {
-    return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...);
+    return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>(), arg,
+                                        std::forward<Args>(args)...);
 }
 
 template <size_t N, typename Fn, typename T = result_of<Fn()>>
 constexpr CMT_INLINE vec<T, N> apply(Fn&& fn)
 {
-    return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>);
+    return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>());
 }
 
 template <typename T, int N>
@@ -1273,26 +1298,26 @@ constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... args)
 KFR_FN(make_mask)
 
 template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> zerovector()
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector()
 {
     constexpr size_t width = N * compound_type_traits<T>::width;
     return compcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>()));
 }
 
 template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> zerovector(vec_t<T, N>)
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector(vec_t<T, N>)
 {
     return zerovector<T, N>();
 }
 KFR_FN(zerovector)
 
 template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> allonesvector()
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector()
 {
     return zerovector<T, N>() == zerovector<T, N>();
 }
 template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>)
+CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>)
 {
     return allonesvector<T, N>();
 }
@@ -1337,7 +1362,7 @@ KFR_FN(low)
 KFR_FN(high)
 }
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 namespace cometa
 {
@@ -1355,7 +1380,7 @@ struct compound_type_traits<kfr::vec_t<T, N>>
     template <typename U>
     using rebind = kfr::vec_t<U, N>;
     template <typename U>
-    using deep_rebind = kfr::vec_t<cometa::deep_rebind<subtype, U>, N>;
+    using deep_rebind = kfr::vec_t<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
 };
 
 template <typename T, size_t N>
@@ -1370,7 +1395,7 @@ struct compound_type_traits<kfr::vec<T, N>>
     template <typename U>
     using rebind = kfr::vec<U, N>;
     template <typename U>
-    using deep_rebind = kfr::vec<cometa::deep_rebind<subtype, U>, N>;
+    using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
 
     CMT_INLINE static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) { return value[index]; }
 };
@@ -1387,7 +1412,7 @@ struct compound_type_traits<kfr::mask<T, N>>
     template <typename U>
     using rebind = kfr::mask<U, N>;
     template <typename U>
-    using deep_rebind = kfr::mask<cometa::deep_rebind<subtype, U>, N>;
+    using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
 
     CMT_INLINE static constexpr subtype at(const kfr::mask<T, N>& value, size_t index)
     {
@@ -1395,6 +1420,7 @@ struct compound_type_traits<kfr::mask<T, N>>
     }
 };
 }
+
 namespace std
 {
 template <typename T1, typename T2, size_t N>
@@ -1412,6 +1438,16 @@ struct common_type<T1, kfr::vec<T2, N>>
 {
     using type = kfr::vec<typename common_type<T1, T2>::type, N>;
 };
+template <typename T1, typename T2, size_t N1, size_t N2>
+struct common_type<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>>
+{
+    using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>;
+};
+template <typename T1, typename T2, size_t N1, size_t N2>
+struct common_type<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>>
+{
+    using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>;
+};
 
 template <typename T1, typename T2, size_t N>
 struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -205,6 +205,11 @@ extern char* gets(char* __s);
 
 #if defined(__GNUC__) || defined(__clang__) // GCC, Clang
 #define CMT_COMPILER_GNU 1
+
+#if !defined(__clang__) // GCC only
+#define CMT_COMPILER_GCC 1
+#endif
+
 #define CMT_GNU_ATTRIBUTES 1
 #define CMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__
@@ -261,8 +266,9 @@ extern char* gets(char* __s);
 
 #endif
 
-#if defined _MSC_VER && _MSC_VER >= 1900 && defined(__clang__) &&                                            \
-    (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9))
+#if defined _MSC_VER && _MSC_VER >= 1900 &&                                                                  \
+    (!defined(__clang__) ||                                                                                  \
+     (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9))))
 #define CMT_EMPTY_BASES __declspec(empty_bases)
 #else
 #define CMT_EMPTY_BASES
@@ -389,10 +395,12 @@ extern char* gets(char* __s);
 #define CMT_HAS_EXCEPTIONS 1
 #endif
 
+#if defined __has_include
 #if __has_include(<assert.h>)
 #include <assert.h>
 #define CMT_HAS_ASSERT_H 1
 #endif
+#endif
 
 #ifndef CMT_THROW
 #if CMT_HAS_EXCEPTIONS
@@ -431,8 +439,46 @@ extern char* gets(char* __s);
 #define CMT_FAST_CC __attribute__((fastcall))
 #define CMT_UNUSED __attribute__((unused))
 #define CMT_GNU_CONSTEXPR constexpr
+#define CMT_GNU_PACKED __attribute__((packed))
+#define CMT_PRAGMA_PACK_PUSH_1
+#define CMT_PRAGMA_PACK_POP
+#define CMT_FP(h, d) h
+#define CMT_PRAGMA_GNU(...) _Pragma(#__VA_ARGS__)
+#ifdef CMT_COMPILER_CLANG
+#define CMT_PRAGMA_CLANG(...) _Pragma(#__VA_ARGS__)
+#else
+#define CMT_PRAGMA_CLANG(...)
+#endif
+#ifdef CMT_COMPILER_GCC // GCC proper only; CMT_COMPILER_GCC is not defined when __clang__ is
+#define CMT_PRAGMA_GCC(...) _Pragma(#__VA_ARGS__)
+#else
+#define CMT_PRAGMA_GCC(...)
+#endif
+#define CMT_PRAGMA_MSVC(...)
 #else
 #define CMT_FAST_CC __fastcall
 #define CMT_UNUSED
 #define CMT_GNU_CONSTEXPR
+#define CMT_GNU_PACKED
+#define CMT_PRAGMA_PACK_PUSH_1 __pragma(pack(push, 1))
+#define CMT_PRAGMA_PACK_POP __pragma(pack(pop))
+#define CMT_FP(h, d) d
+#define CMT_PRAGMA_GNU(...)
+#define CMT_PRAGMA_CLANG(...)
+#define CMT_PRAGMA_GCC(...)
+#define CMT_PRAGMA_MSVC(...) __pragma(__VA_ARGS__)
+#endif
+
+#if defined CMT_COMPILER_CLANG
+#if defined _MSC_VER
+#define CMT_COMPIER_NAME "clang-msvc"
+#else
+#define CMT_COMPIER_NAME "clang"
+#endif
+#elif defined CMT_COMPILER_GCC
+#define CMT_COMPIER_NAME "gcc"
+#elif defined CMT_COMPILER_MSVC
+#define CMT_COMPIER_NAME "msvc"
+#else
+#define CMT_COMPIER_NAME "unknown"
 #endif
diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp
@@ -10,8 +10,11 @@
 #include <type_traits>
 #include <utility>
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4814))
 
 namespace cometa
 {
@@ -228,11 +231,12 @@ using subtype = typename compound_type_traits<T>::subtype;
 template <typename T>
 using deep_subtype = typename compound_type_traits<T>::deep_subtype;
 
-template <typename T, typename SubType>
+/*template <typename T, typename SubType>
 using rebind_subtype = typename compound_type_traits<T>::template rebind<SubType>;
 
 template <typename T, typename SubType>
 using deep_rebind = typename compound_type_traits<T>::template deep_rebind<SubType>;
+ */
 
 template <typename T>
 struct compound_type_traits<std::pair<T, T>>
@@ -247,7 +251,8 @@ struct compound_type_traits<std::pair<T, T>>
     template <typename U>
     using rebind = std::pair<U, U>;
     template <typename U>
-    using deep_rebind = std::pair<cometa::deep_rebind<subtype, U>, cometa::deep_rebind<subtype, U>>;
+    using deep_rebind = std::pair<typename compound_type_traits<subtype>::template deep_rebind<U>,
+                                  typename compound_type_traits<subtype>::template deep_rebind<U>>;
 
     CMT_INLINE static constexpr const subtype& at(const std::pair<subtype, subtype>& value, size_t index)
     {
@@ -346,27 +351,12 @@ using cuint_t = cval_t<unsigned, val>;
 template <size_t val>
 using csize_t = cval_t<size_t, val>;
 
-template <typename T, T val>
-constexpr cval_t<T, val> cval{};
-
-template <bool val>
-constexpr cbool_t<val> cbool{};
-
 using cfalse_t = cbool_t<false>;
 using ctrue_t  = cbool_t<true>;
 
 constexpr ctrue_t ctrue{};
 constexpr cfalse_t cfalse{};
 
-template <int val>
-constexpr cint_t<val> cint{};
-
-template <unsigned val>
-constexpr cuint_t<val> cuint{};
-
-template <size_t val>
-constexpr csize_t<val> csize{};
-
 namespace details
 {
 template <size_t index, typename T, T first, T... rest>
@@ -406,17 +396,17 @@ struct cvals_t : ops::empty
     using type = cvals_t<T, values...>;
     constexpr static size_t size() { return sizeof...(values); }
     template <size_t index>
-    constexpr T operator[](csize_t<index>)
+    constexpr T operator[](csize_t<index>) const
     {
-        return get(csize<index>);
+        return get(csize_t<index>());
     }
     template <size_t index>
     constexpr static T get(csize_t<index> = csize_t<index>())
     {
         return details::get_nth<index, T, values...>::value;
     }
-    constexpr static T front() { return get(csize<0>); }
-    constexpr static T back() { return get(csize<size() - 1>); }
+    constexpr static T front() { return get(csize_t<0>()); }
+    constexpr static T back() { return get(csize_t<size() - 1>()); }
 
     static const T* begin() { return array(); }
     static const T* end() { return array() + size(); }
@@ -444,6 +434,8 @@ struct cvals_t<T> : ops::empty
 template <bool... values>
 using cbools_t = cvals_t<bool, values...>;
 
+constexpr cbools_t<false, true> cfalse_true{};
+
 template <int... values>
 using cints_t = cvals_t<int, values...>;
 
@@ -459,39 +451,16 @@ using csizes_t = cvals_t<size_t, values...>;
 template <size_t... values>
 using elements_t = cvals_t<size_t, values...>;
 
-template <typename T, T... values>
-constexpr cvals_t<T, values...> cvals{};
-
-template <bool... vals>
-constexpr cbools_t<vals...> cbools{};
-
-constexpr cbools_t<false, true> cfalse_true{};
-
-template <int... vals>
-constexpr cints_t<vals...> cints{};
-
-template <char... vals>
-constexpr cchars_t<vals...> cchars{};
-
-template <unsigned... vals>
-constexpr cuints_t<vals...> cuints{};
-
-template <size_t... vals>
-constexpr csizes_t<vals...> csizes{};
-
-template <size_t... vals>
-constexpr elements_t<vals...> elements{};
-
 template <typename T>
-constexpr inline T csum(cvals_t<T>)
+constexpr inline T csum(cvals_t<T> = cvals_t<T>())
 {
     return 0;
 }
 
 template <typename T, T first, T... rest>
-constexpr inline T csum(cvals_t<T, first, rest...>)
+constexpr inline T csum(cvals_t<T, first, rest...> = cvals_t<T, first, rest...>())
 {
-    return first + csum(cvals<T, rest...>);
+    return first + csum(cvals_t<T, rest...>());
 }
 
 template <typename T>
@@ -503,7 +472,7 @@ constexpr inline T cprod(cvals_t<T>)
 template <typename T, T first, T... rest>
 constexpr inline T cprod(cvals_t<T, first, rest...>)
 {
-    return first * cprod(cvals<T, rest...>);
+    return first * cprod(cvals_t<T, rest...>());
 }
 
 template <typename T>
@@ -515,9 +484,6 @@ struct ctype_t
 template <typename T>
 using type_of = typename T::type;
 
-template <typename T>
-constexpr ctype_t<T> ctype{};
-
 template <typename... Types>
 struct ctypes_t
 {
@@ -533,8 +499,6 @@ struct ctypes_t
     }
 };
 
-template <typename... Ts>
-constexpr ctypes_t<Ts...> ctypes{};
 namespace details
 {
 template <typename T1, typename T2>
@@ -711,36 +675,12 @@ struct cvalseq_impl<T, 1, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart)>
 template <typename T, size_t size, T start = T(), ptrdiff_t step = 1>
 using cvalseq_t = typename details::cvalseq_impl<T, size, start, step>::type;
 
-template <typename T, T begin, T end>
-constexpr cvalseq_t<T, end - begin, begin> cvalrange{};
-
-template <size_t begin, size_t end>
-constexpr cvalseq_t<size_t, end - begin, begin> csizerange{};
-
-template <int begin, int end>
-constexpr cvalseq_t<int, end - begin, begin> cintrange{};
-
-template <unsigned begin, unsigned end>
-constexpr cvalseq_t<unsigned, end - begin, begin> cuintrange{};
-
-template <typename T, size_t size, T start = T(), ptrdiff_t step = 1>
-constexpr cvalseq_t<T, size, start, step> cvalseq{};
-
 template <size_t size, size_t start = 0, ptrdiff_t step = 1>
-constexpr cvalseq_t<size_t, size, start, step> csizeseq{};
-
-template <size_t size, int start = 0, ptrdiff_t step = 1>
-constexpr cvalseq_t<int, size, start, step> cintseq{};
-
-template <size_t size, unsigned start = 0, ptrdiff_t step = 1>
-constexpr cvalseq_t<unsigned, size, start, step> cuintseq{};
+using csizeseq_t = cvalseq_t<size_t, size, start, step>;
 
 template <typename... List>
 using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>;
 
-template <typename... List>
-constexpr indicesfor_t<List...> indicesfor{};
-
 namespace details
 {
 
@@ -775,21 +715,27 @@ struct is_enabled_impl<Fn, void_t<decltype(Fn::disabled)>> : std::integral_const
 {
 };
 
-template <size_t N>
+template <int N>
 struct unique_enum_impl
 {
-    enum class type : size_t
+    enum type : int
     {
         value = N
     };
 };
-template <size_t N>
-using unique_enum = typename unique_enum_impl<N>::type;
 
+#ifdef CMT_COMPILER_MSVC
+#define CMT_ENABLE_IF_IMPL(N, ...) /* MSVC form: condition bound to enable_, N-keyed dummy enum */          \
+    bool enable_ = (__VA_ARGS__), typename enabled_ = typename ::std::enable_if<enable_>::type,              \
+         typename cometa::details::unique_enum_impl<N>::type dummy_ =                                        \
+             ::cometa::details::unique_enum_impl<N>::value
+
+#else
 #define CMT_ENABLE_IF_IMPL(N, ...)                                                                           \
-    typename ::std::enable_if<(__VA_ARGS__), ::cometa::details::unique_enum<N>>::type =                      \
-        ::cometa::details::unique_enum<N>::value
+    typename ::std::enable_if<(__VA_ARGS__), typename ::cometa::details::unique_enum_impl<N>::type>::type =  \
+        ::cometa::details::unique_enum_impl<N>::value
 
+#endif
 #define CMT_ENABLE_IF(...) CMT_ENABLE_IF_IMPL(__LINE__, __VA_ARGS__)
 }
 
@@ -1020,10 +966,16 @@ struct identity_impl
 };
 
 template <typename T>
-constexpr size_t elementsize = sizeof(T);
+constexpr size_t elementsize()
+{
+    return sizeof(T);
+}
 
 template <>
-constexpr size_t elementsize<void> = 1;
+constexpr size_t elementsize<void>() // void specialization: sizeof(void) is ill-formed, so report 1
+{
+    return 1;
+}
 }
 
 template <typename T>
@@ -1048,7 +1000,7 @@ struct carray<T, 1>
 
     template <typename Fn, size_t index = 0, CMT_ENABLE_IF(is_callable<Fn, csize_t<index>>::value)>
     CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept
-        : val(static_cast<T>(fn(csize<index>)))
+        : val(static_cast<T>(fn(csize_t<index>())))
     {
     }
 
@@ -1071,12 +1023,12 @@ struct carray<T, 1>
     template <size_t index>
     CMT_INTRIN constexpr T& get() noexcept
     {
-        return get(csize<index>);
+        return get(csize_t<index>());
     }
     template <size_t index>
     CMT_INTRIN constexpr const T& get() const noexcept
     {
-        return get(csize<index>);
+        return get(csize_t<index>());
     }
     CMT_INTRIN constexpr const T* front() const noexcept { return val; }
     CMT_INTRIN constexpr T* front() noexcept { return val; }
@@ -1103,8 +1055,8 @@ struct carray : carray<T, N - 1>
 
     template <typename Fn, size_t index = N - 1>
     CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept
-        : carray<T, N - 1>(std::forward<Fn>(fn), csize<index - 1>),
-          val(static_cast<T>(fn(csize<index>)))
+        : carray<T, N - 1>(std::forward<Fn>(fn), csize_t<index - 1>()),
+          val(static_cast<T>(fn(csize_t<index>())))
     {
     }
 
@@ -1116,23 +1068,23 @@ struct carray : carray<T, N - 1>
     template <size_t index>
     CMT_INTRIN constexpr T& get(csize_t<index>) noexcept
     {
-        return carray<T, N - 1>::get(csize<index>);
+        return carray<T, N - 1>::get(csize_t<index>());
     }
     CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; }
     template <size_t index>
     CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept
     {
-        return carray<T, N - 1>::get(csize<index>);
+        return carray<T, N - 1>::get(csize_t<index>());
     }
     template <size_t index>
     CMT_INTRIN constexpr T& get() noexcept
     {
-        return get(csize<index>);
+        return get(csize_t<index>());
     }
     template <size_t index>
     CMT_INTRIN constexpr const T& get() const noexcept
     {
-        return get(csize<index>);
+        return get(csize_t<index>());
     }
     CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); }
     CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); }
@@ -1307,10 +1259,25 @@ using has_data_size = details::has_data_size_impl<decay<T>>;
 template <typename T>
 using value_type_of = typename decay<T>::value_type;
 
+#ifndef CMT_COMPILER_CLANG
+namespace details
+{
+template <typename T, T value, typename Fn>
+void cforeach_impl(Fn&& fn)
+{
+    fn(cval_t<T, value>());
+}
+}
+#endif
+
 template <typename T, T... values, typename Fn>
 CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn)
 {
-    swallow{ (fn(cval<T, values>), void(), 0)... };
+#ifdef CMT_COMPILER_CLANG
+    swallow{ (fn(cval_t<T, values>()), void(), 0)... };
+#else
+    swallow{ (details::cforeach_impl<T, values>(std::forward<Fn>(fn)), void(), 0)... };
+#endif
 }
 
 template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)>
@@ -1335,9 +1302,9 @@ namespace details
 {
 
 template <size_t index, typename... types>
-CMT_INTRIN auto get_type_arg(ctypes_t<types...> type_list)
+CMT_INTRIN auto get_type_arg(ctypes_t<types...>)
 {
-    return ctype<type_of<details::get_nth_type<index, types...>>>;
+    return ctype_t<type_of<details::get_nth_type<index, types...>>>();
 }
 
 template <typename T0, typename... types, typename Fn, size_t... indices>
@@ -1350,7 +1317,7 @@ CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, c
 template <typename... Ts, typename Fn>
 CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn)
 {
-    details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq<sizeof...(Ts)>);
+    details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq_t<sizeof...(Ts)>());
 }
 
 template <typename A0, typename A1, typename Fn>
@@ -1371,26 +1338,26 @@ CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn)
 template <typename TrueFn, typename FalseFn = fn_noop>
 CMT_INTRIN decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn())
 {
-    return truefn(cbool<true>);
+    return truefn(ctrue);
 }
 
 template <typename TrueFn, typename FalseFn = fn_noop>
 CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn())
 {
-    return falsefn(cbool<false>);
+    return falsefn(cfalse);
 }
 
 template <typename T, T start, T stop, typename BodyFn>
 CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn)
 {
-    return cforeach(cvalrange<T, start, stop>, std::forward<BodyFn>(bodyfn));
+    return cforeach(cvalseq_t<T, stop - start, start>(), std::forward<BodyFn>(bodyfn));
 }
 
 template <typename T, T... vs, typename U, typename Function, typename Fallback = fn_noop>
 void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& fallback = Fallback())
 {
     bool result = false;
-    swallow{ (result = result || ((vs == value) ? (function(cval<T, vs>), void(), true) : false), void(),
+    swallow{ (result = result || ((vs == value) ? (function(cval_t<T, vs>()), void(), true) : false), void(),
               0)... };
     if (!result)
         fallback();
@@ -1408,7 +1375,7 @@ CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, 
 {
     if (cmpfn(value, v0))
     {
-        return fn(cval<T, v0>);
+        return fn(cval_t<T, v0>());
     }
     else
     {
@@ -1442,7 +1409,7 @@ inline decltype(auto) cmatch_impl(T&& value, Fn1&& first, Fn2&& second, Fns&&...
 {
     using first_arg        = typename function_arguments<Fn1>::template nth<0>;
     constexpr bool is_same = std::is_same<decay<T>, decay<first_arg>>::value;
-    return cmatch_impl2(cbool<is_same>, std::forward<T>(value), std::forward<Fn1>(first),
+    return cmatch_impl2(cbool_t<is_same>(), std::forward<T>(value), std::forward<Fn1>(first),
                         std::forward<Fn2>(second), std::forward<Fns>(rest)...);
 }
 
@@ -1588,9 +1555,9 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y)
     return static_cast<const unsigned char*>(x) - static_cast<const unsigned char*>(y);
 }
 
-#pragma GCC diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wundefined-reinterpret-cast")
-#pragma GCC diagnostic ignored "-Wundefined-reinterpret-cast"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wundefined-reinterpret-cast")
 #endif
 
 template <typename T, typename U>
@@ -1641,7 +1608,80 @@ CMT_INLINE constexpr static T implicit_cast(U&& value)
     return std::forward<T>(value);
 }
 
-#pragma GCC diagnostic pop
+#ifdef CMT_COMPILER_GNU
+
+template <typename T, T val>
+constexpr cval_t<T, val> cval{};
+
+template <bool val>
+constexpr cbool_t<val> cbool{};
+
+template <int val>
+constexpr cint_t<val> cint{};
+
+template <unsigned val>
+constexpr cuint_t<val> cuint{};
+
+template <size_t val>
+constexpr csize_t<val> csize{};
+
+template <typename T, T... values>
+constexpr cvals_t<T, values...> cvals{};
+
+template <bool... vals>
+constexpr cbools_t<vals...> cbools{};
+
+template <int... vals>
+constexpr cints_t<vals...> cints{};
+
+template <char... vals>
+constexpr cchars_t<vals...> cchars{};
+
+template <unsigned... vals>
+constexpr cuints_t<vals...> cuints{};
+
+template <size_t... vals>
+constexpr csizes_t<vals...> csizes{};
+
+template <size_t... vals>
+constexpr elements_t<vals...> elements{};
+
+template <typename T>
+constexpr ctype_t<T> ctype{};
+
+template <typename... Ts>
+constexpr ctypes_t<Ts...> ctypes{};
+
+template <typename T, T begin, T end>
+constexpr cvalseq_t<T, end - begin, begin> cvalrange{};
+
+template <size_t begin, size_t end>
+constexpr cvalseq_t<size_t, end - begin, begin> csizerange{};
+
+template <int begin, int end>
+constexpr cvalseq_t<int, end - begin, begin> cintrange{};
+
+template <unsigned begin, unsigned end>
+constexpr cvalseq_t<unsigned, end - begin, begin> cuintrange{};
+
+template <typename T, size_t size, T start = T(), ptrdiff_t step = 1>
+constexpr cvalseq_t<T, size, start, step> cvalseq{};
+
+template <size_t size, size_t start = 0, ptrdiff_t step = 1>
+constexpr cvalseq_t<size_t, size, start, step> csizeseq{};
+
+template <size_t size, int start = 0, ptrdiff_t step = 1>
+constexpr cvalseq_t<int, size, start, step> cintseq{};
+
+template <size_t size, unsigned start = 0, ptrdiff_t step = 1>
+constexpr cvalseq_t<unsigned, size, start, step> cuintseq{};
+template <typename... List>
+constexpr indicesfor_t<List...> indicesfor{};
+#endif
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 }
 
-#pragma GCC diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/cometa/array.hpp b/include/kfr/cometa/array.hpp
@@ -30,8 +30,13 @@ public:
     constexpr array_ref() noexcept : m_data(nullptr), m_size(0) {}
     constexpr array_ref(const array_ref&) noexcept = default;
     constexpr array_ref(array_ref&&) noexcept      = default;
+#ifdef CMT_COMPILER_GNU
     constexpr array_ref& operator=(const array_ref&) noexcept = default;
     constexpr array_ref& operator=(array_ref&&) noexcept = default;
+#else
+    array_ref& operator=(const array_ref&) = default;
+    array_ref& operator=(array_ref&&) = default;
+#endif
 
     template <size_t N>
     constexpr array_ref(value_type (&arr)[N]) noexcept : m_data(arr), m_size(N)
diff --git a/include/kfr/cometa/cstring.hpp b/include/kfr/cometa/cstring.hpp
@@ -31,13 +31,13 @@ struct cstring
     template <size_t start, size_t count>
     constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const noexcept
     {
-        return slice_impl(csizeseq<count, start>);
+        return slice_impl(csizeseq_t<count, start>());
     }
 
     template <size_t start>
     constexpr cstring<N - start> slice(csize_t<start>) const noexcept
     {
-        return slice_impl(csizeseq<N - 1 - start, start>);
+        return slice_impl(csizeseq_t<N - 1 - start, start>());
     }
 
     constexpr friend bool operator==(const cstring& left, const cstring& right) noexcept
@@ -85,14 +85,13 @@ CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<
                                                                   const cstring<N2>& str2,
                                                                   csizes_t<indices...>)
 {
-    constexpr size_t L1 = N1 - 1;
-    return { { (indices < L1 ? str1[indices] : str2[indices - L1])..., 0 } };
+    return { { (indices < (N1 - 1) ? str1[indices] : str2[indices - (N1 - 1)])..., 0 } };
 }
 template <size_t N1, size_t N2, typename... Args>
 CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1,
                                                                   const cstring<N2>& str2)
 {
-    return concat_str_impl(str1, str2, csizeseq<N1 - 1 + N2 - 1>);
+    return concat_str_impl(str1, str2, cvalseq_t<size_t, N1 - 1 + N2 - 1>());
 }
 template <size_t N1, size_t Nfrom, size_t Nto, size_t... indices>
 CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str,
@@ -125,7 +124,7 @@ CMT_INTRIN constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<
 template <size_t N>
 CMT_INTRIN constexpr cstring<N> make_cstring(const char (&str)[N])
 {
-    return details::make_cstring_impl(str, csizeseq<N - 1>);
+    return details::make_cstring_impl(str, cvalseq_t<size_t, N - 1>());
 }
 
 template <char... chars>
@@ -154,6 +153,7 @@ template <size_t N1, size_t Nfrom, size_t Nto>
 CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from,
                                                  const cstring<Nto>& to)
 {
-    return details::str_replace_impl(str_find(str, from), str, from, to, csizeseq<N1 - Nfrom + Nto - 1>);
+    return details::str_replace_impl(str_find(str, from), str, from, to,
+                                     cvalseq_t<size_t, N1 - Nfrom + Nto - 1>());
 }
 }
diff --git a/include/kfr/cometa/ctti.hpp b/include/kfr/cometa/ctti.hpp
@@ -45,9 +45,9 @@ constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) 
 template <typename T>
 constexpr auto ctype_name() noexcept
 {
-    constexpr size_t length =
-        sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix - details::typename_postfix;
-    return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix, csizeseq<length>);
+    return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix,
+                                     csizeseq_t<(sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix -
+                                                 details::typename_postfix)>());
 }
 
 /**
diff --git a/include/kfr/cometa/function.hpp b/include/kfr/cometa/function.hpp
@@ -143,7 +143,7 @@ inline function<Ret(Args...)> cdispatch(cvals_t<T, v0, values...>, identity<T> v
     if (value == v0)
     {
         return [=](Args... args)
-                   CMT_INLINE_MEMBER -> Ret { return fn(cval<T, v0>, std::forward<Args>(args)...); };
+                   CMT_INLINE_MEMBER -> Ret { return fn(cval_t<T, v0>(), std::forward<Args>(args)...); };
     }
     else
     {
diff --git a/include/kfr/cometa/named_arg.hpp b/include/kfr/cometa/named_arg.hpp
@@ -5,6 +5,9 @@
 
 #include "../cometa.hpp"
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4814))
+
 namespace cometa
 {
 template <typename T>
@@ -28,3 +31,5 @@ struct named
 
 inline named operator""_arg(const char* name, size_t) { return name; }
 }
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp
@@ -12,10 +12,10 @@
 #include <string>
 #include <utility>
 
-#pragma GCC diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wformat-security")
-#pragma GCC diagnostic ignored "-Wformat-security"
-#pragma GCC diagnostic ignored "-Wused-but-marked-unused"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wformat-security")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wused-but-marked-unused")
 #endif
 
 namespace cometa
@@ -24,11 +24,12 @@ namespace cometa
 template <typename T>
 struct representation
 {
+    using type = T;
     static constexpr const T& get(const T& value) noexcept { return value; }
 };
 
 template <typename T>
-using repr_type = decay<decltype(representation<T>::get(std::declval<T>()))>;
+using repr_type = typename representation<T>::type;
 
 template <typename... Args>
 CMT_INLINE std::string as_string(const Args&... args);
@@ -102,7 +103,7 @@ CMT_INLINE constexpr auto value_fmt(ctype_t<const void*>) { return make_cstring(
 template <char... chars>
 CMT_INLINE constexpr auto value_fmt(ctype_t<cchars_t<chars...>>)
 {
-    return concat_cstring(make_cstring("%s"), make_cstring(cchars<chars...>));
+    return concat_cstring(make_cstring("%s"), make_cstring(cchars_t<chars...>()));
 }
 
 template <typename T>
@@ -115,7 +116,7 @@ template <typename T, int width, int prec>
 CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, static_cast<char>(-1), width, prec>> fmt)
 {
     return concat_cstring(make_cstring("%"), value_fmt_arg(fmt),
-                          value_fmt(ctype<repr_type<T>>).slice(csize<1>));
+                          value_fmt(ctype_t<repr_type<T>>()).slice(csize_t<1>()));
 }
 template <typename T, char t, int width, int prec>
 CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, t, width, prec>> fmt)
@@ -188,7 +189,7 @@ CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& 
 template <size_t N1, size_t Nto>
 CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt)
 {
-    return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>);
+    return fmt_replace_impl(str, newfmt, csizeseq_t<N1 - 3 + Nto - 1>());
 }
 
 inline std::string replace_one(const std::string& str, const std::string& from, const std::string& to)
@@ -207,8 +208,8 @@ CMT_INLINE const std::string& build_fmt(const std::string& str, ctypes_t<>) { re
 template <typename Arg, typename... Args>
 CMT_INLINE auto build_fmt(const std::string& str, ctypes_t<Arg, Args...>)
 {
-    constexpr auto fmt = value_fmt(ctype<decay<Arg>>);
-    return build_fmt(replace_one(str, "{}", std::string(fmt.data())), ctypes<Args...>);
+    constexpr auto fmt = value_fmt(ctype_t<decay<Arg>>());
+    return build_fmt(replace_one(str, "{}", std::string(fmt.data())), ctypes_t<Args...>());
 }
 }
 
@@ -224,22 +225,24 @@ CMT_INLINE details::fmt_t<T, static_cast<char>(-1), width, prec> fmtwidth(const 
     return { value };
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wgnu-string-literal-operator-template"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wgnu-string-literal-operator-template")
 
 constexpr auto build_fmt_str(cchars_t<>, ctypes_t<>) { return make_cstring(""); }
 
 template <char... chars, typename Arg, typename... Args>
 constexpr auto build_fmt_str(cchars_t<'@', chars...>, ctypes_t<Arg, Args...>)
 {
-    return concat_cstring(details::value_fmt(ctype<decay<Arg>>),
-                          build_fmt_str(cchars<chars...>, ctypes<Args...>));
+    return concat_cstring(details::value_fmt(ctype_t<decay<Arg>>()),
+                          build_fmt_str(cchars_t<chars...>(), ctypes_t<Args...>()));
 }
 
 template <char ch, char... chars, typename... Args>
 constexpr auto build_fmt_str(cchars_t<ch, chars...>, ctypes_t<Args...>)
 {
-    return concat_cstring(make_cstring(cchars<ch>), build_fmt_str(cchars<chars...>, ctypes<Args...>));
+    return concat_cstring(make_cstring(cchars_t<ch>()),
+                          build_fmt_str(cchars_t<chars...>(), ctypes_t<Args...>()));
 }
 
 template <char... chars>
@@ -248,7 +251,7 @@ struct format_t
     template <typename... Args>
     inline std::string operator()(const Args&... args)
     {
-        constexpr auto format_str = build_fmt_str(cchars<chars...>, ctypes<repr_type<Args>...>);
+        constexpr auto format_str = build_fmt_str(cchars_t<chars...>(), ctypes_t<repr_type<Args>...>());
 
         std::string result;
         const int size = std::snprintf(nullptr, 0, format_str.data(), details::pack_value(args)...);
@@ -267,12 +270,14 @@ struct print_t
     template <typename... Args>
     CMT_INLINE void operator()(const Args&... args)
     {
-        constexpr auto format_str = build_fmt_str(cchars<chars...>, ctypes<repr_type<Args>...>);
+        constexpr auto format_str = build_fmt_str(cchars_t<chars...>(), ctypes_t<repr_type<Args>...>());
 
         std::printf(format_str.data(), details::pack_value(args)...);
     }
 };
 
+#ifdef CMT_COMPILER_GNU
+
 template <typename Char, Char... chars>
 constexpr format_t<chars...> operator""_format()
 {
@@ -285,26 +290,28 @@ constexpr CMT_INLINE print_t<chars...> operator""_print()
     return {};
 }
 
-#pragma GCC diagnostic pop
+#endif
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 template <typename... Args>
 CMT_INLINE void printfmt(const std::string& fmt, const Args&... args)
 {
-    const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>);
+    const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>());
     std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...);
 }
 
 template <typename... Args>
 CMT_INLINE void fprintfmt(FILE* f, const std::string& fmt, const Args&... args)
 {
-    const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>);
+    const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>());
     std::fprintf(f, format_str.data(), details::pack_value(representation<Args>::get(args))...);
 }
 
 template <typename... Args>
 CMT_INLINE int snprintfmt(char* str, size_t size, const std::string& fmt, const Args&... args)
 {
-    const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>);
+    const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>());
     return std::snprintf(str, size, format_str.data(),
                          details::pack_value(representation<Args>::get(args))...);
 }
@@ -313,7 +320,7 @@ template <typename... Args>
 CMT_INLINE std::string format(const std::string& fmt, const Args&... args)
 {
     std::string result;
-    const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>);
+    const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>());
     const int size =
         std::snprintf(nullptr, 0, format_str.data(), details::pack_value(representation<Args>::get(args))...);
     if (size <= 0)
@@ -329,22 +336,24 @@ namespace details
 template <typename T>
 constexpr auto get_value_fmt()
 {
-    return details::value_fmt(ctype<decay<repr_type<T>>>);
+    return details::value_fmt(ctype_t<decay<repr_type<T>>>());
 }
 }
 
 template <typename... Args>
 CMT_INLINE void print(const Args&... args)
 {
-    constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()...);
-    std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...);
+    constexpr const auto format_str = concat_cstring(details::get_value_fmt<Args>()...);
+    const char* str                 = format_str.data();
+    std::printf(str, details::pack_value(representation<Args>::get(args))...);
 }
 
 template <typename... Args>
 CMT_INLINE void println(const Args&... args)
 {
-    constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()..., make_cstring("\n"));
-    std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...);
+    constexpr const auto format_str = concat_cstring(details::get_value_fmt<Args>()..., make_cstring("\n"));
+    const char* str                 = format_str.data();
+    std::printf(str, details::pack_value(representation<Args>::get(args))...);
 }
 
 template <typename... Args>
@@ -352,13 +361,13 @@ CMT_INLINE std::string as_string(const Args&... args)
 {
     std::string result;
     constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()...);
+    const char* str           = format_str.data();
 
-    const int size =
-        std::snprintf(nullptr, 0, format_str.data(), details::pack_value(representation<Args>::get(args))...);
+    const int size = std::snprintf(nullptr, 0, str, details::pack_value(representation<Args>::get(args))...);
     if (size <= 0)
         return result;
     result.resize(size_t(size + 1));
-    result.resize(size_t(std::snprintf(&result[0], size_t(size + 1), format_str.data(),
+    result.resize(size_t(std::snprintf(&result[0], size_t(size + 1), str,
                                        details::pack_value(representation<Args>::get(args))...)));
     return result;
 }
@@ -402,6 +411,7 @@ inline std::string join(T x, U y, Ts... rest)
 template <typename T>
 struct representation<named_arg<T>>
 {
+    using type = std::string;
     static std::string get(const named_arg<T>& value)
     {
         return std::string(value.name) + " = " + as_string(value.value);
@@ -411,6 +421,7 @@ struct representation<named_arg<T>>
 template <typename T1, typename T2>
 struct representation<std::pair<T1, T2>>
 {
+    using type = std::string;
     static std::string get(const std::pair<T1, T2>& value)
     {
         return "(" + as_string(value.first) + "; " + as_string(value.second) + ")";
@@ -418,4 +429,4 @@ struct representation<std::pair<T1, T2>>
 };
 }
 
-#pragma GCC diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/data/sincos.hpp b/include/kfr/data/sincos.hpp
@@ -32,82 +32,160 @@ namespace data
 {
 
 // data generated by mpfr
-template <typename T>
-constexpr T c_sin_table[64] = {
-    /* sin(2*pi*   0/ 256) */ T(0.0),
-    /* sin(2*pi*   1/ 256) */ T(0.02454122852291228803173452945928292506547),
-    /* sin(2*pi*   2/ 256) */ T(0.04906767432741801425495497694268265831475),
-    /* sin(2*pi*   3/ 256) */ T(0.0735645635996674235294656215752343218133),
-    /* sin(2*pi*   4/ 256) */ T(0.09801714032956060199419556388864184586114),
-    /* sin(2*pi*   5/ 256) */ T(0.1224106751992161984987044741509457875752),
-    /* sin(2*pi*   6/ 256) */ T(0.1467304744553617516588501296467178197062),
-    /* sin(2*pi*   7/ 256) */ T(0.1709618887603012263636423572082635319663),
-    /* sin(2*pi*   8/ 256) */ T(0.1950903220161282678482848684770222409277),
-    /* sin(2*pi*   9/ 256) */ T(0.2191012401568697972277375474973577988484),
-    /* sin(2*pi*  10/ 256) */ T(0.242980179903263889948274162077471118321),
-    /* sin(2*pi*  11/ 256) */ T(0.2667127574748983863252865151164363940421),
-    /* sin(2*pi*  12/ 256) */ T(0.2902846772544623676361923758173952746915),
-    /* sin(2*pi*  13/ 256) */ T(0.3136817403988914766564788459941003099934),
-    /* sin(2*pi*  14/ 256) */ T(0.3368898533922200506892532126191475704778),
-    /* sin(2*pi*  15/ 256) */ T(0.3598950365349881487751045723267564202023),
-    /* sin(2*pi*  16/ 256) */ T(0.3826834323650897717284599840303988667613),
-    /* sin(2*pi*  17/ 256) */ T(0.4052413140049898709084813055050524665119),
-    /* sin(2*pi*  18/ 256) */ T(0.4275550934302820943209668568887985343046),
-    /* sin(2*pi*  19/ 256) */ T(0.4496113296546066000462945794242270758832),
-    /* sin(2*pi*  20/ 256) */ T(0.4713967368259976485563876259052543776575),
-    /* sin(2*pi*  21/ 256) */ T(0.4928981922297840368730266887588092682397),
-    /* sin(2*pi*  22/ 256) */ T(0.514102744193221726593693838968815772608),
-    /* sin(2*pi*  23/ 256) */ T(0.5349976198870972106630769046370179155603),
-    /* sin(2*pi*  24/ 256) */ T(0.5555702330196022247428308139485328743749),
-    /* sin(2*pi*  25/ 256) */ T(0.575808191417845300745972453815730841776),
-    /* sin(2*pi*  26/ 256) */ T(0.5956993044924333434670365288299698895119),
-    /* sin(2*pi*  27/ 256) */ T(0.6152315905806268454849135634139842776594),
-    /* sin(2*pi*  28/ 256) */ T(0.6343932841636454982151716132254933706757),
-    /* sin(2*pi*  29/ 256) */ T(0.6531728429537767640842030136563054150769),
-    /* sin(2*pi*  30/ 256) */ T(0.6715589548470184006253768504274218032288),
-    /* sin(2*pi*  31/ 256) */ T(0.6895405447370669246167306299574847028455),
-    /* sin(2*pi*  32/ 256) */ T(0.7071067811865475244008443621048490392848),
-    /* sin(2*pi*  33/ 256) */ T(0.7242470829514669209410692432905531674831),
-    /* sin(2*pi*  34/ 256) */ T(0.740951125354959091175616897495162729729),
-    /* sin(2*pi*  35/ 256) */ T(0.7572088465064845475754640536057844730404),
-    /* sin(2*pi*  36/ 256) */ T(0.773010453362736960810906609758469800971),
-    /* sin(2*pi*  37/ 256) */ T(0.7883464276266062620091647053596892826565),
-    /* sin(2*pi*  38/ 256) */ T(0.8032075314806449098066765129631419238796),
-    /* sin(2*pi*  39/ 256) */ T(0.817584813151583696504920884130633809471),
-    /* sin(2*pi*  40/ 256) */ T(0.8314696123025452370787883776179057567386),
-    /* sin(2*pi*  41/ 256) */ T(0.8448535652497070732595712051049570977198),
-    /* sin(2*pi*  42/ 256) */ T(0.8577286100002720699022699842847701370425),
-    /* sin(2*pi*  43/ 256) */ T(0.8700869911087114186522924044838488439108),
-    /* sin(2*pi*  44/ 256) */ T(0.8819212643483550297127568636603883495084),
-    /* sin(2*pi*  45/ 256) */ T(0.8932243011955153203424164474933979780006),
-    /* sin(2*pi*  46/ 256) */ T(0.9039892931234433315862002972305370487101),
-    /* sin(2*pi*  47/ 256) */ T(0.9142097557035306546350148293935774010447),
-    /* sin(2*pi*  48/ 256) */ T(0.9238795325112867561281831893967882868224),
-    /* sin(2*pi*  49/ 256) */ T(0.932992798834738887711660255543302498295),
-    /* sin(2*pi*  50/ 256) */ T(0.9415440651830207784125094025995023571856),
-    /* sin(2*pi*  51/ 256) */ T(0.9495281805930366671959360741893450282522),
-    /* sin(2*pi*  52/ 256) */ T(0.9569403357322088649357978869802699694828),
-    /* sin(2*pi*  53/ 256) */ T(0.9637760657954398666864643555078351536631),
-    /* sin(2*pi*  54/ 256) */ T(0.9700312531945439926039842072861002514569),
-    /* sin(2*pi*  55/ 256) */ T(0.975702130038528544460395766419527971644),
-    /* sin(2*pi*  56/ 256) */ T(0.9807852804032304491261822361342390369739),
-    /* sin(2*pi*  57/ 256) */ T(0.9852776423889412447740184331785477871601),
-    /* sin(2*pi*  58/ 256) */ T(0.9891765099647809734516737380162430639837),
-    /* sin(2*pi*  59/ 256) */ T(0.9924795345987099981567672516611178200108),
-    /* sin(2*pi*  60/ 256) */ T(0.9951847266721968862448369531094799215755),
-    /* sin(2*pi*  61/ 256) */ T(0.9972904566786902161355971401825678211717),
-    /* sin(2*pi*  62/ 256) */ T(0.9987954562051723927147716047591006944432),
-    /* sin(2*pi*  63/ 256) */ T(0.9996988186962042201157656496661721968501)
+constexpr f32 c_sin_table_f32[64] = {
+    /* sin(2*pi*   0/ 256) */ f32(0.0),
+    /* sin(2*pi*   1/ 256) */ f32(0.02454122852291228803173452945928292506547),
+    /* sin(2*pi*   2/ 256) */ f32(0.04906767432741801425495497694268265831475),
+    /* sin(2*pi*   3/ 256) */ f32(0.0735645635996674235294656215752343218133),
+    /* sin(2*pi*   4/ 256) */ f32(0.09801714032956060199419556388864184586114),
+    /* sin(2*pi*   5/ 256) */ f32(0.1224106751992161984987044741509457875752),
+    /* sin(2*pi*   6/ 256) */ f32(0.1467304744553617516588501296467178197062),
+    /* sin(2*pi*   7/ 256) */ f32(0.1709618887603012263636423572082635319663),
+    /* sin(2*pi*   8/ 256) */ f32(0.1950903220161282678482848684770222409277),
+    /* sin(2*pi*   9/ 256) */ f32(0.2191012401568697972277375474973577988484),
+    /* sin(2*pi*  10/ 256) */ f32(0.242980179903263889948274162077471118321),
+    /* sin(2*pi*  11/ 256) */ f32(0.2667127574748983863252865151164363940421),
+    /* sin(2*pi*  12/ 256) */ f32(0.2902846772544623676361923758173952746915),
+    /* sin(2*pi*  13/ 256) */ f32(0.3136817403988914766564788459941003099934),
+    /* sin(2*pi*  14/ 256) */ f32(0.3368898533922200506892532126191475704778),
+    /* sin(2*pi*  15/ 256) */ f32(0.3598950365349881487751045723267564202023),
+    /* sin(2*pi*  16/ 256) */ f32(0.3826834323650897717284599840303988667613),
+    /* sin(2*pi*  17/ 256) */ f32(0.4052413140049898709084813055050524665119),
+    /* sin(2*pi*  18/ 256) */ f32(0.4275550934302820943209668568887985343046),
+    /* sin(2*pi*  19/ 256) */ f32(0.4496113296546066000462945794242270758832),
+    /* sin(2*pi*  20/ 256) */ f32(0.4713967368259976485563876259052543776575),
+    /* sin(2*pi*  21/ 256) */ f32(0.4928981922297840368730266887588092682397),
+    /* sin(2*pi*  22/ 256) */ f32(0.514102744193221726593693838968815772608),
+    /* sin(2*pi*  23/ 256) */ f32(0.5349976198870972106630769046370179155603),
+    /* sin(2*pi*  24/ 256) */ f32(0.5555702330196022247428308139485328743749),
+    /* sin(2*pi*  25/ 256) */ f32(0.575808191417845300745972453815730841776),
+    /* sin(2*pi*  26/ 256) */ f32(0.5956993044924333434670365288299698895119),
+    /* sin(2*pi*  27/ 256) */ f32(0.6152315905806268454849135634139842776594),
+    /* sin(2*pi*  28/ 256) */ f32(0.6343932841636454982151716132254933706757),
+    /* sin(2*pi*  29/ 256) */ f32(0.6531728429537767640842030136563054150769),
+    /* sin(2*pi*  30/ 256) */ f32(0.6715589548470184006253768504274218032288),
+    /* sin(2*pi*  31/ 256) */ f32(0.6895405447370669246167306299574847028455),
+    /* sin(2*pi*  32/ 256) */ f32(0.7071067811865475244008443621048490392848),
+    /* sin(2*pi*  33/ 256) */ f32(0.7242470829514669209410692432905531674831),
+    /* sin(2*pi*  34/ 256) */ f32(0.740951125354959091175616897495162729729),
+    /* sin(2*pi*  35/ 256) */ f32(0.7572088465064845475754640536057844730404),
+    /* sin(2*pi*  36/ 256) */ f32(0.773010453362736960810906609758469800971),
+    /* sin(2*pi*  37/ 256) */ f32(0.7883464276266062620091647053596892826565),
+    /* sin(2*pi*  38/ 256) */ f32(0.8032075314806449098066765129631419238796),
+    /* sin(2*pi*  39/ 256) */ f32(0.817584813151583696504920884130633809471),
+    /* sin(2*pi*  40/ 256) */ f32(0.8314696123025452370787883776179057567386),
+    /* sin(2*pi*  41/ 256) */ f32(0.8448535652497070732595712051049570977198),
+    /* sin(2*pi*  42/ 256) */ f32(0.8577286100002720699022699842847701370425),
+    /* sin(2*pi*  43/ 256) */ f32(0.8700869911087114186522924044838488439108),
+    /* sin(2*pi*  44/ 256) */ f32(0.8819212643483550297127568636603883495084),
+    /* sin(2*pi*  45/ 256) */ f32(0.8932243011955153203424164474933979780006),
+    /* sin(2*pi*  46/ 256) */ f32(0.9039892931234433315862002972305370487101),
+    /* sin(2*pi*  47/ 256) */ f32(0.9142097557035306546350148293935774010447),
+    /* sin(2*pi*  48/ 256) */ f32(0.9238795325112867561281831893967882868224),
+    /* sin(2*pi*  49/ 256) */ f32(0.932992798834738887711660255543302498295),
+    /* sin(2*pi*  50/ 256) */ f32(0.9415440651830207784125094025995023571856),
+    /* sin(2*pi*  51/ 256) */ f32(0.9495281805930366671959360741893450282522),
+    /* sin(2*pi*  52/ 256) */ f32(0.9569403357322088649357978869802699694828),
+    /* sin(2*pi*  53/ 256) */ f32(0.9637760657954398666864643555078351536631),
+    /* sin(2*pi*  54/ 256) */ f32(0.9700312531945439926039842072861002514569),
+    /* sin(2*pi*  55/ 256) */ f32(0.975702130038528544460395766419527971644),
+    /* sin(2*pi*  56/ 256) */ f32(0.9807852804032304491261822361342390369739),
+    /* sin(2*pi*  57/ 256) */ f32(0.9852776423889412447740184331785477871601),
+    /* sin(2*pi*  58/ 256) */ f32(0.9891765099647809734516737380162430639837),
+    /* sin(2*pi*  59/ 256) */ f32(0.9924795345987099981567672516611178200108),
+    /* sin(2*pi*  60/ 256) */ f32(0.9951847266721968862448369531094799215755),
+    /* sin(2*pi*  61/ 256) */ f32(0.9972904566786902161355971401825678211717),
+    /* sin(2*pi*  62/ 256) */ f32(0.9987954562051723927147716047591006944432),
+    /* sin(2*pi*  63/ 256) */ f32(0.9996988186962042201157656496661721968501)
+};
+
+// data generated by mpfr
+constexpr f64 c_sin_table_f64[64] = {
+    /* sin(2*pi*   0/ 256) */ f64(0.0),
+    /* sin(2*pi*   1/ 256) */ f64(0.02454122852291228803173452945928292506547),
+    /* sin(2*pi*   2/ 256) */ f64(0.04906767432741801425495497694268265831475),
+    /* sin(2*pi*   3/ 256) */ f64(0.0735645635996674235294656215752343218133),
+    /* sin(2*pi*   4/ 256) */ f64(0.09801714032956060199419556388864184586114),
+    /* sin(2*pi*   5/ 256) */ f64(0.1224106751992161984987044741509457875752),
+    /* sin(2*pi*   6/ 256) */ f64(0.1467304744553617516588501296467178197062),
+    /* sin(2*pi*   7/ 256) */ f64(0.1709618887603012263636423572082635319663),
+    /* sin(2*pi*   8/ 256) */ f64(0.1950903220161282678482848684770222409277),
+    /* sin(2*pi*   9/ 256) */ f64(0.2191012401568697972277375474973577988484),
+    /* sin(2*pi*  10/ 256) */ f64(0.242980179903263889948274162077471118321),
+    /* sin(2*pi*  11/ 256) */ f64(0.2667127574748983863252865151164363940421),
+    /* sin(2*pi*  12/ 256) */ f64(0.2902846772544623676361923758173952746915),
+    /* sin(2*pi*  13/ 256) */ f64(0.3136817403988914766564788459941003099934),
+    /* sin(2*pi*  14/ 256) */ f64(0.3368898533922200506892532126191475704778),
+    /* sin(2*pi*  15/ 256) */ f64(0.3598950365349881487751045723267564202023),
+    /* sin(2*pi*  16/ 256) */ f64(0.3826834323650897717284599840303988667613),
+    /* sin(2*pi*  17/ 256) */ f64(0.4052413140049898709084813055050524665119),
+    /* sin(2*pi*  18/ 256) */ f64(0.4275550934302820943209668568887985343046),
+    /* sin(2*pi*  19/ 256) */ f64(0.4496113296546066000462945794242270758832),
+    /* sin(2*pi*  20/ 256) */ f64(0.4713967368259976485563876259052543776575),
+    /* sin(2*pi*  21/ 256) */ f64(0.4928981922297840368730266887588092682397),
+    /* sin(2*pi*  22/ 256) */ f64(0.514102744193221726593693838968815772608),
+    /* sin(2*pi*  23/ 256) */ f64(0.5349976198870972106630769046370179155603),
+    /* sin(2*pi*  24/ 256) */ f64(0.5555702330196022247428308139485328743749),
+    /* sin(2*pi*  25/ 256) */ f64(0.575808191417845300745972453815730841776),
+    /* sin(2*pi*  26/ 256) */ f64(0.5956993044924333434670365288299698895119),
+    /* sin(2*pi*  27/ 256) */ f64(0.6152315905806268454849135634139842776594),
+    /* sin(2*pi*  28/ 256) */ f64(0.6343932841636454982151716132254933706757),
+    /* sin(2*pi*  29/ 256) */ f64(0.6531728429537767640842030136563054150769),
+    /* sin(2*pi*  30/ 256) */ f64(0.6715589548470184006253768504274218032288),
+    /* sin(2*pi*  31/ 256) */ f64(0.6895405447370669246167306299574847028455),
+    /* sin(2*pi*  32/ 256) */ f64(0.7071067811865475244008443621048490392848),
+    /* sin(2*pi*  33/ 256) */ f64(0.7242470829514669209410692432905531674831),
+    /* sin(2*pi*  34/ 256) */ f64(0.740951125354959091175616897495162729729),
+    /* sin(2*pi*  35/ 256) */ f64(0.7572088465064845475754640536057844730404),
+    /* sin(2*pi*  36/ 256) */ f64(0.773010453362736960810906609758469800971),
+    /* sin(2*pi*  37/ 256) */ f64(0.7883464276266062620091647053596892826565),
+    /* sin(2*pi*  38/ 256) */ f64(0.8032075314806449098066765129631419238796),
+    /* sin(2*pi*  39/ 256) */ f64(0.817584813151583696504920884130633809471),
+    /* sin(2*pi*  40/ 256) */ f64(0.8314696123025452370787883776179057567386),
+    /* sin(2*pi*  41/ 256) */ f64(0.8448535652497070732595712051049570977198),
+    /* sin(2*pi*  42/ 256) */ f64(0.8577286100002720699022699842847701370425),
+    /* sin(2*pi*  43/ 256) */ f64(0.8700869911087114186522924044838488439108),
+    /* sin(2*pi*  44/ 256) */ f64(0.8819212643483550297127568636603883495084),
+    /* sin(2*pi*  45/ 256) */ f64(0.8932243011955153203424164474933979780006),
+    /* sin(2*pi*  46/ 256) */ f64(0.9039892931234433315862002972305370487101),
+    /* sin(2*pi*  47/ 256) */ f64(0.9142097557035306546350148293935774010447),
+    /* sin(2*pi*  48/ 256) */ f64(0.9238795325112867561281831893967882868224),
+    /* sin(2*pi*  49/ 256) */ f64(0.932992798834738887711660255543302498295),
+    /* sin(2*pi*  50/ 256) */ f64(0.9415440651830207784125094025995023571856),
+    /* sin(2*pi*  51/ 256) */ f64(0.9495281805930366671959360741893450282522),
+    /* sin(2*pi*  52/ 256) */ f64(0.9569403357322088649357978869802699694828),
+    /* sin(2*pi*  53/ 256) */ f64(0.9637760657954398666864643555078351536631),
+    /* sin(2*pi*  54/ 256) */ f64(0.9700312531945439926039842072861002514569),
+    /* sin(2*pi*  55/ 256) */ f64(0.975702130038528544460395766419527971644),
+    /* sin(2*pi*  56/ 256) */ f64(0.9807852804032304491261822361342390369739),
+    /* sin(2*pi*  57/ 256) */ f64(0.9852776423889412447740184331785477871601),
+    /* sin(2*pi*  58/ 256) */ f64(0.9891765099647809734516737380162430639837),
+    /* sin(2*pi*  59/ 256) */ f64(0.9924795345987099981567672516611178200108),
+    /* sin(2*pi*  60/ 256) */ f64(0.9951847266721968862448369531094799215755),
+    /* sin(2*pi*  61/ 256) */ f64(0.9972904566786902161355971401825678211717),
+    /* sin(2*pi*  62/ 256) */ f64(0.9987954562051723927147716047591006944432),
+    /* sin(2*pi*  63/ 256) */ f64(0.9996988186962042201157656496661721968501)
 };
 }
 
 template <typename T>
-constexpr inline T sin_using_table_256(size_t k)
+constexpr inline T sin_using_table_256(size_t k);
+
+template <>
+constexpr inline f32 sin_using_table_256<f32>(size_t k)
+{
+    return (k == 0 || k == 128) ? 0.f : (k == 64) ? 1.f : (k > 128)
+                                                              ? -sin_using_table_256<f32>(k - 128)
+                                                              : (k > 64) ? sin_using_table_256<f32>(128 - k)
+                                                                         : data::c_sin_table_f32[k];
+}
+template <>
+constexpr inline f64 sin_using_table_256<f64>(size_t k)
 {
-    return (k == 0 || k == 128) ? T(0) : (k == 64) ? T(1) : (k > 128)
-                                                                ? -sin_using_table_256<T>(k - 128)
-                                                                : (k > 64) ? sin_using_table_256<T>(128 - k)
-                                                                           : data::c_sin_table<T>[k];
+    return (k == 0 || k == 128) ? 0.0 : (k == 64) ? 1.0 : (k > 128)
+                                                              ? -sin_using_table_256<f64>(k - 128)
+                                                              : (k > 64) ? sin_using_table_256<f64>(128 - k)
+                                                                         : data::c_sin_table_f64[k];
 }
 
 template <typename T>
diff --git a/include/kfr/dft/bitrev.hpp b/include/kfr/dft/bitrev.hpp
@@ -43,7 +43,7 @@ namespace internal
 constexpr bool fft_reorder_aligned = false;
 
 template <size_t Bits>
-constexpr inline u32 bitrev_using_table(u32 x)
+CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
 {
     constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table));
     if (Bits > bitrev_table_log2N)
@@ -52,7 +52,7 @@ constexpr inline u32 bitrev_using_table(u32 x)
     return data::bitrev_table[x] >> (bitrev_table_log2N - Bits);
 }
 
-constexpr inline u32 bitrev_using_table(u32 x, size_t bits)
+CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
 {
     constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table));
     if (bits > bitrev_table_log2N)
@@ -61,7 +61,7 @@ constexpr inline u32 bitrev_using_table(u32 x, size_t bits)
     return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
 }
 
-constexpr inline u32 dig4rev_using_table(u32 x, size_t bits)
+CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
 {
     constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table));
     if (bits > bitrev_table_log2N)
@@ -253,10 +253,10 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>)
 }
 
 template <typename T, bool use_br2>
-void cwrite_reordered(T* out, cvec<T, 16> value, size_t N4, cbool_t<use_br2>)
+void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
 {
-    value = digitreverse < use_br2 ? 2 : 4, 2 > (value);
-    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, value);
+    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4,
+                                            digitreverse<(use_br2 ? 2 : 4), 2>(value));
 }
 
 template <typename T, bool use_br2>
@@ -265,8 +265,8 @@ KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbo
     CMT_ASSUME(i != j);
     const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4);
     const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4);
-    cwrite_reordered(inout + j, vi, N4, cbool<use_br2>);
-    cwrite_reordered(inout + i, vj, N4, cbool<use_br2>);
+    cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>());
+    cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>());
 }
 
 template <typename T>
@@ -316,7 +316,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
     T* io                  = ptr_cast<T>(inout);
 
     size_t i = 0;
-#pragma clang loop unroll_count(2)
+    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
     for (; i < iend;)
     {
         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
@@ -326,7 +326,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
         i += istep * 4;
     }
     iend += N16;
-#pragma clang loop unroll_count(2)
+    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
     for (; i < iend;)
     {
         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
@@ -341,7 +341,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
         i += istep * 3;
     }
     iend += N16;
-#pragma clang loop unroll_count(2)
+    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
     for (; i < iend;)
     {
         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
@@ -361,7 +361,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
         i += istep * 2;
     }
     iend += N16;
-#pragma clang loop unroll_count(2)
+    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
     for (; i < iend;)
     {
         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
diff --git a/include/kfr/dft/cache.hpp b/include/kfr/dft/cache.hpp
@@ -39,11 +39,12 @@ using dft_plan_ptr = std::shared_ptr<const dft_plan<T>>;
 template <typename T>
 using dft_plan_real_ptr = std::shared_ptr<const dft_plan_real<T>>;
 
-struct dft_cache
+template <int = 0>
+struct dft_cache_impl
 {
-    static dft_cache& instance()
+    static dft_cache_impl& instance()
     {
-        static dft_cache cache;
+        static dft_cache_impl cache;
         return cache;
     }
     dft_plan_ptr<f32> get(ctype_t<f32>, size_t size)
@@ -120,10 +121,12 @@ private:
 #endif
 };
 
+using dft_cache = dft_cache_impl<>;
+
 template <typename T, size_t Tag>
 univector<complex<T>> dft(const univector<complex<T>, Tag>& input)
 {
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, input.size());
+    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), input.size());
     univector<complex<T>> output(input.size());
     univector<u8> temp(dft->temp_size);
     dft->execute(output, input, temp);
@@ -133,7 +136,7 @@ univector<complex<T>> dft(const univector<complex<T>, Tag>& input)
 template <typename T, size_t Tag>
 univector<complex<T>> idft(const univector<complex<T>, Tag>& input)
 {
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, input.size());
+    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), input.size());
     univector<complex<T>> output(input.size());
     univector<u8> temp(dft->temp_size);
     dft->execute(output, input, temp, ctrue);
@@ -143,7 +146,7 @@ univector<complex<T>> idft(const univector<complex<T>, Tag>& input)
 template <typename T, size_t Tag>
 univector<complex<T>> realdft(const univector<T, Tag>& input)
 {
-    dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype<T>, input.size());
+    dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype_t<T>(), input.size());
     univector<complex<T>> output(input.size() / 2 + 1);
     univector<u8> temp(dft->temp_size);
     dft->execute(output, input, temp);
@@ -153,7 +156,7 @@ univector<complex<T>> realdft(const univector<T, Tag>& input)
 template <typename T, size_t Tag>
 univector<T> irealdft(const univector<complex<T>, Tag>& input)
 {
-    dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype<T>, input.size());
+    dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype_t<T>(), input.size());
     univector<T> output((input.size() - 1) * 2);
     univector<u8> temp(dft->temp_size);
     dft->execute(output, input, temp);
diff --git a/include/kfr/dft/conv.hpp b/include/kfr/dft/conv.hpp
@@ -34,9 +34,9 @@
 #include "cache.hpp"
 #include "fft.hpp"
 
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wshadow")
-#pragma clang diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 #endif
 
 namespace kfr
@@ -51,7 +51,7 @@ KFR_INTRIN univector<T> convolve(const univector<T, Tag1>& src1, const univector
     src1padded.resize(size, 0);
     src2padded.resize(size, 0);
 
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, size);
+    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
     univector<u8> temp(dft->temp_size);
     dft->execute(src1padded, src1padded, temp);
     dft->execute(src2padded, src2padded, temp);
@@ -69,7 +69,7 @@ KFR_INTRIN univector<T> correlate(const univector<T, Tag1>& src1, const univecto
     univector<complex<T>> src2padded = reverse(src2);
     src1padded.resize(size, 0);
     src2padded.resize(size, 0);
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, size);
+    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
     univector<u8> temp(dft->temp_size);
     dft->execute(src1padded, src1padded, temp);
     dft->execute(src2padded, src2padded, temp);
@@ -87,4 +87,4 @@ KFR_INTRIN univector<T> autocorrelate(const univector<T, Tag1>& src)
     return result;
 }
 }
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -35,11 +35,14 @@
 #include "bitrev.hpp"
 #include "ft.hpp"
 
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wshadow")
-#pragma clang diagnostic ignored "-Wshadow"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
 #endif
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4100))
+
 namespace kfr
 {
 
@@ -65,9 +68,9 @@ protected:
     virtual void do_execute(complex<T>*, const complex<T>*, u8* temp) = 0;
 };
 
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wassume")
-#pragma clang diagnostic ignored "-Wassume"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
 #endif
 
 namespace internal
@@ -75,15 +78,17 @@ namespace internal
 
 template <size_t width, bool inverse, typename T>
 KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>,
-                                                cvec<T, width> w, cvec<T, width> tw)
+                                                const cvec<T, width>& w, const cvec<T, width>& tw)
 {
-    cvec<T, width> b1 = w * dupeven(tw);
-    w = swap<2>(w);
+    cvec<T, width> ww  = w;
+    cvec<T, width> tw_ = tw;
+    cvec<T, width> b1  = ww * dupeven(tw_);
+    ww = swap<2>(ww);
 
     if (inverse)
-        tw = -(tw);
-    w      = subadd(b1, w * dupodd(tw));
-    return w;
+        tw_ = -(tw_);
+    ww      = subadd(b1, ww * dupodd(tw_));
+    return ww;
 }
 
 template <size_t width, bool use_br2, bool inverse, bool aligned, typename T>
@@ -107,9 +112,9 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfals
 
     cwrite<width, aligned>(out, sum02 + sum13);
     w2 = sum02 - sum13;
-    cwrite<width, aligned>(
-        out + N4 * (use_br2 ? 1 : 2),
-        radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w2, cread<width, true>(twiddle + width)));
+    cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2),
+                           radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2,
+                                                cread<width, true>(twiddle + width)));
     diff02 = a0 - a2;
     diff13 = a1 - a3;
     if (inverse)
@@ -125,17 +130,17 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfals
 
     w1 = diff02 + diff13;
 
-    cwrite<width, aligned>(
-        out + N4 * (use_br2 ? 2 : 1),
-        radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w1, cread<width, true>(twiddle + 0)));
+    cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1),
+                           radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1,
+                                                cread<width, true>(twiddle + 0)));
     w3 = diff02 - diff13;
-    cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w3,
-                                                              cread<width, true>(twiddle + width * 2)));
+    cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(),
+                                                              w3, cread<width, true>(twiddle + width * 2)));
 }
 
 template <size_t width, bool inverse, typename T>
 KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>,
-                                                cvec<T, width> w, cvec<T, width> tw)
+                                                const cvec<T, width>& w, const cvec<T, width>& tw)
 {
     vec<T, width> re1, im1, twre, twim;
     split(w, re1, im1);
@@ -144,10 +149,9 @@ KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_
     const vec<T, width> b1re = re1 * twre;
     const vec<T, width> b1im = im1 * twre;
     if (inverse)
-        w = concat(b1re + im1 * twim, b1im - re1 * twim);
+        return concat(b1re + im1 * twim, b1im - re1 * twim);
     else
-        w = concat(b1re - im1 * twim, b1im + re1 * twim);
-    return w;
+        return concat(b1re - im1 * twim, b1im + re1 * twim);
 }
 
 template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T>
@@ -175,8 +179,8 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout
     cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im));
     w2 = concat(sum02re - sum13re, sum02im - sum13im);
     cwrite_split<width, aligned, write_split>(
-        out + N4 * (use_br2 ? 1 : 2),
-        radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w2, cread<width, true>(twiddle + width)));
+        out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2,
+                                                           cread<width, true>(twiddle + width)));
 
     const vec<T, width> diff02re = re0 - re2;
     const vec<T, width> diff02im = im0 - im2;
@@ -187,11 +191,11 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout
     (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
 
     cwrite_split<width, aligned, write_split>(
-        out + N4 * (use_br2 ? 2 : 1),
-        radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w1, cread<width, true>(twiddle + 0)));
-    cwrite_split<width, aligned, write_split>(out + N4 * 3,
-                                              radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w3,
-                                                                   cread<width, true>(twiddle + width * 2)));
+        out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1,
+                                                           cread<width, true>(twiddle + 0)));
+    cwrite_split<width, aligned, write_split>(
+        out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3,
+                                           cread<width, true>(twiddle + width * 2)));
 }
 
 template <typename T>
@@ -254,30 +258,29 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
     }
 }
 
-template <typename T>
-KFR_SINTRIN void prefetch_one(const complex<T>* in)
-{
 #ifdef CMT_ARCH_X86
-    __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0);
+#ifdef CMT_COMPILER_GNU
+#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
+#else
+#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0);
+#endif
 #else
-    __builtin_prefetch(ptr_cast<void>(in));
+#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr));
 #endif
+
+template <typename T>
+KFR_SINTRIN void prefetch_one(const complex<T>* in)
+{
+    KFR_PREFETCH(in);
 }
 
 template <typename T>
 KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in)
 {
-#ifdef CMT_ARCH_X86
-    __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0);
-    __builtin_prefetch(ptr_cast<void>(in + stride), 0, _MM_HINT_T0);
-    __builtin_prefetch(ptr_cast<void>(in + stride * 2), 0, _MM_HINT_T0);
-    __builtin_prefetch(ptr_cast<void>(in + stride * 3), 0, _MM_HINT_T0);
-#else
-    __builtin_prefetch(ptr_cast<void>(in));
-    __builtin_prefetch(ptr_cast<void>(in + stride));
-    __builtin_prefetch(ptr_cast<void>(in + stride * 2));
-    __builtin_prefetch(ptr_cast<void>(in + stride * 3));
-#endif
+    KFR_PREFETCH(in);
+    KFR_PREFETCH(in + stride);
+    KFR_PREFETCH(in + stride * 2);
+    KFR_PREFETCH(in + stride * 3);
 }
 
 template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
@@ -287,20 +290,21 @@ KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t
                                  complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
 {
     constexpr static size_t prefetch_offset = width * 8;
-    const auto N4                           = N / csize<4>;
-    const auto N43                          = N4 * csize<3>;
+    const auto N4                           = N / csize_t<4>();
+    const auto N43                          = N4 * csize_t<3>();
     CMT_ASSUME(blocks > 0);
     CMT_ASSUME(N > 0);
     CMT_ASSUME(N4 > 0);
     CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++)
     {
-#pragma clang loop unroll_count(2)
+        CMT_PRAGMA_CLANG(clang loop unroll_count(2))
         for (size_t n2 = 0; n2 < N4; n2 += width)
         {
             if (prefetch)
                 prefetch_four(N4, in + prefetch_offset);
-            radix4_body(N, csize<width>, cbool < splitout || splitin >, cbool<splitout>, cbool<splitin>,
-                        cbool<use_br2>, cbool<inverse>, cbool<aligned>, out, in, twiddle + n2 * 3);
+            radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
+                        cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
+                        in, twiddle + n2 * 3);
             in += width;
             out += width;
         }
@@ -321,7 +325,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfal
     for (size_t b = 0; b < blocks; b++)
     {
         if (prefetch)
-            prefetch_four(csize<64>, out + prefetch_offset);
+            prefetch_four(csize_t<64>(), out + prefetch_offset);
         cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
         split(cread<8, aligned>(out + 0), w0, w1);
         split(cread<8, aligned>(out + 8), w2, w3);
@@ -330,13 +334,13 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfal
 
         butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
 
-        w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>);
-        w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>);
-        w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>);
-        w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>);
-        w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>);
-        w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>);
-        w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>);
+        w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
+        w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
+        w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
+        w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
+        w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
+        w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
+        w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
 
         cvec<T, 8> z0, z1, z2, z3;
         transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
@@ -380,7 +384,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfal
 {
     CMT_ASSUME(blocks > 0);
     constexpr static size_t prefetch_offset = width * 4;
-#pragma clang loop unroll_count(2)
+    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
     for (size_t b = 0; b < blocks; b += 2)
     {
         if (prefetch)
@@ -434,13 +438,14 @@ struct fft_stage_impl : dft_stage<T>
         this->stage_size = stage_size;
         this->repeats    = 4;
         this->recursion  = true;
-        this->data_size  = align_up(sizeof(complex<T>) * stage_size / 4 * 3, native_cache_alignment);
+        this->data_size =
+            align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
     }
 
 protected:
     constexpr static bool prefetch = true;
     constexpr static bool aligned  = false;
-    constexpr static size_t width  = vector_width<T, cpu_t::native>;
+    constexpr static size_t width  = platform<T>::vector_width;
 
     virtual void do_initialize(size_t size) override final
     {
@@ -452,12 +457,12 @@ protected:
     {
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         if (splitin)
-            in                  = out;
-        const size_t stage_size = this->stage_size;
-        CMT_ASSUME(stage_size >= 2048);
-        CMT_ASSUME(stage_size % 2048 == 0);
-        radix4_pass(stage_size, 1, csize<width>, ctrue, cbool<splitin>, cbool<!is_even>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, in, twiddle);
+            in                = out;
+        const size_t stg_size = this->stage_size;
+        CMT_ASSUME(stg_size >= 2048);
+        CMT_ASSUME(stg_size % 2048 == 0);
+        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
     }
 };
 
@@ -470,11 +475,11 @@ struct fft_final_stage_impl : dft_stage<T>
         this->out_offset = size;
         this->repeats    = 4;
         this->recursion  = true;
-        this->data_size  = align_up(sizeof(complex<T>) * size * 3 / 2, native_cache_alignment);
+        this->data_size  = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
     }
 
 protected:
-    constexpr static size_t width  = vector_width<T, cpu_t::native>;
+    constexpr static size_t width  = platform<T>::vector_width;
     constexpr static bool is_even  = cometa::is_even(ilog2(size));
     constexpr static bool use_br2  = !is_even;
     constexpr static bool aligned  = false;
@@ -483,11 +488,11 @@ protected:
     virtual void do_initialize(size_t total_size) override final
     {
         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
-        size_t stage_size   = this->stage_size;
-        while (stage_size > 4)
+        size_t stg_size     = this->stage_size;
+        while (stg_size > 4)
         {
-            initialize_twiddles<T, width>(twiddle, stage_size, total_size, true);
-            stage_size /= 4;
+            initialize_twiddles<T, width>(twiddle, stg_size, total_size, true);
+            stg_size /= 4;
         }
     }
 
@@ -496,55 +501,55 @@ protected:
         constexpr bool is_double    = sizeof(T) == 8;
         constexpr size_t final_size = is_even ? (is_double ? 4 : 16) : (is_double ? 8 : 32);
         const complex<T>* twiddle   = ptr_cast<complex<T>>(this->data);
-        final_pass(csize<final_size>, out, in, twiddle);
+        final_pass(csize_t<final_size>(), out, in, twiddle);
     }
 
     KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(512, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, in, twiddle);
-        radix4_pass(128, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(32, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<8>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(512, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(128, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(32, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<8>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
     KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(512, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, in, twiddle);
-        radix4_pass(128, 4, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<32>, 16, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(512, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(128, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<32>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
     KFR_INTRIN void final_pass(csize_t<4>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(1024, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, in, twiddle);
-        radix4_pass(256, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(64, 16, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(16, 64, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<4>, 256, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(1024, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(256, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(64, 16, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(16, 64, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<4>(), 256, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
     KFR_INTRIN void final_pass(csize_t<16>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(1024, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, in, twiddle);
-        radix4_pass(256, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(64, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<16>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(1024, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(256, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(64, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<16>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 };
 
@@ -565,7 +570,7 @@ protected:
 
     virtual void do_execute(complex<T>* out, const complex<T>*, u8* /*temp*/) override final
     {
-        fft_reorder(out, log2n, cbool<!is_even>);
+        fft_reorder(out, log2n, cbool_t<!is_even>());
     }
 };
 
@@ -596,7 +601,7 @@ protected:
     {
         cvec<T, 1> a0, a1, a2, a3;
         split(cread<4>(in), a0, a1, a2, a3);
-        butterfly(cbool<inverse>, a0, a1, a2, a3, a0, a1, a2, a3);
+        butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
         cwrite<4>(out, concat(a0, a1, a2, a3));
     }
 };
@@ -651,7 +656,7 @@ protected:
     constexpr static bool aligned = false;
     virtual void do_execute(complex<T>* out, const complex<T>* in, u8*) override final
     {
-        butterfly64(cbool<inverse>, cbool<aligned>, out, in);
+        butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
     }
 };
 
@@ -661,12 +666,12 @@ struct fft_specialization<T, 7, inverse> : dft_stage<T>
     fft_specialization(size_t)
     {
         this->stage_size = 128;
-        this->data_size  = align_up(sizeof(complex<T>) * 128 * 3 / 2, native_cache_alignment);
+        this->data_size  = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
     }
 
 protected:
     constexpr static bool aligned        = false;
-    constexpr static size_t width        = vector_width<T, cpu_t::native>;
+    constexpr static size_t width        = platform<T>::vector_width;
     constexpr static bool use_br2        = true;
     constexpr static bool prefetch       = false;
     constexpr static bool is_double      = sizeof(T) == 8;
@@ -684,26 +689,26 @@ protected:
     virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override final
     {
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
-        final_pass(csize<final_size>, out, in, twiddle);
-        fft_reorder(out, csize<7>);
+        final_pass(csize_t<final_size>(), out, in, twiddle);
+        fft_reorder(out, csize_t<7>());
     }
 
     KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(128, 1, csize<width>, ctrue, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, in, twiddle);
-        radix4_pass(32, 4, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<8>, 16, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
     KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(128, 1, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, in, twiddle);
-        radix4_pass(csize<32>, 4, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 };
 
@@ -749,12 +754,12 @@ struct fft_specialization<double, 8, inverse> : dft_stage<double>
     fft_specialization(size_t)
     {
         this->stage_size = 256;
-        this->data_size  = align_up(sizeof(complex<T>) * 256 * 3 / 2, native_cache_alignment);
+        this->data_size  = align_up(sizeof(complex<T>) * 256 * 3 / 2, platform<>::native_cache_alignment);
     }
 
 protected:
     constexpr static bool aligned        = false;
-    constexpr static size_t width        = vector_width<T, cpu_t::native>;
+    constexpr static size_t width        = platform<T>::vector_width;
     constexpr static bool use_br2        = false;
     constexpr static bool prefetch       = false;
     constexpr static size_t split_format = true;
@@ -770,20 +775,20 @@ protected:
     virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override final
     {
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
-        final_pass(csize<4>, out, in, twiddle);
-        fft_reorder(out, csize<8>);
+        final_pass(csize_t<4>(), out, in, twiddle);
+        fft_reorder(out, csize_t<8>());
     }
 
     KFR_INTRIN void final_pass(csize_t<4>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
     {
-        radix4_pass(256, 1, csize<width>, ctrue, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, in, twiddle);
-        radix4_pass(64, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(16, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>,
-                    cbool<aligned>, out, out, twiddle);
-        radix4_pass(csize<4>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>,
-                    cbool<inverse>, cbool<aligned>, out, out, twiddle);
+        radix4_pass(256, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+        radix4_pass(64, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(16, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<4>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 };
 
@@ -835,15 +840,16 @@ struct dft_plan
         if (is_poweroftwo(size))
         {
             const size_t log2n = ilog2(size);
-            cswitch(csizes<1, 2, 3, 4, 5, 6, 7, 8>, log2n,
+            cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8>(), log2n,
                     [&](auto log2n) {
-                        add_stage<internal::fft_specialization_t<T, val_of(decltype(log2n)()),
-                                                                 false>::template type>(size, type);
+                        (void)log2n;
+                        this->add_stage<internal::fft_specialization_t<T, val_of(decltype(log2n)()),
+                                                                       false>::template type>(size, type);
                     },
                     [&]() {
                         cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
-                            make_fft(size, type, is_even, ctrue);
-                            add_stage<internal::fft_reorder_stage_impl_t<
+                            this->make_fft(size, type, is_even, ctrue);
+                            this->add_stage<internal::fft_reorder_stage_impl_t<
                                 T, val_of(decltype(is_even)())>::template type>(size, type);
                         });
                     });
@@ -927,7 +933,7 @@ protected:
         {
             add_stage<fft_stage_impl_t::template type>(stage_size, type);
 
-            make_fft(stage_size / 4, cbools<direct, inverse>, cbool<is_even>, cfalse);
+            make_fft(stage_size / 4, cbools_t<direct, inverse>(), cbool_t<is_even>(), cfalse);
         }
         else
         {
@@ -1021,15 +1027,13 @@ struct dft_plan_real : dft_plan<T>
         : dft_plan<T>(size / 2, type), rtwiddle(size / 4)
     {
         using namespace internal;
-        const size_t csize = size / 2;
-        const size_t count = size / 4;
 
-        constexpr size_t width = vector_width<T> * 2;
+        constexpr size_t width = platform<T>::vector_width * 2;
 
-        block_process(count, csizes<width, 1>, [=](size_t i, auto w) {
+        block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) {
             constexpr size_t width = val_of(decltype(w)());
             cwrite<width>(rtwiddle.data() + i,
-                          cossin(dup(-c_pi<T> * ((enumerate<T, width>() + i + count) / csize))));
+                          cossin(dup(-c_pi<T> * ((enumerate<T, width>() + i + size / 4) / (size / 2)))));
         });
     }
 
@@ -1069,13 +1073,13 @@ private:
     void to_fmt(complex<T>* out, dft_pack_format fmt) const
     {
         using namespace internal;
-        const size_t csize = this->size; // / 2;
+        size_t csize = this->size; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
 
-        constexpr size_t width = vector_width<T> * 2;
+        constexpr size_t width = platform<T>::vector_width * 2;
         const cvec<T, 1> dc = cread<1>(out);
         const size_t count = csize / 2;
 
-        block_process(count, csizes<width, 1>, [=](size_t i, auto w) {
+        block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) {
             constexpr size_t width   = val_of(decltype(w)());
             constexpr size_t widthm1 = width - 1;
             const cvec<T, width> tw   = cread<width>(rtwiddle.data() + i);
@@ -1122,10 +1126,10 @@ private:
             dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag());
         }
 
-        constexpr size_t width = vector_width<T> * 2;
+        constexpr size_t width = platform<T>::vector_width * 2;
         const size_t count     = csize / 2;
 
-        block_process(count, csizes<width, 1>, [=](size_t i, auto w) {
+        block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) {
             constexpr size_t width   = val_of(decltype(w)());
             constexpr size_t widthm1 = width - 1;
             const cvec<T, width> tw   = cread<width>(rtwiddle.data() + i);
@@ -1150,4 +1154,6 @@ private:
 };
 }
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -37,6 +37,9 @@
 #include "../base/memory.hpp"
 #include "../data/sincos.hpp"
 
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4127))
+
 namespace kfr
 {
 
@@ -44,18 +47,18 @@ namespace internal
 {
 
 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, N> y)
+CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y)
 {
     return subadd(x * dupeven(y), swap<2>(x) * dupodd(y));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, 2> y)
+CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y)
 {
     vec<T, N> yy = resize<N>(y);
     return cmul_impl(x, yy);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y)
+CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y)
 {
     vec<T, N> xx = resize<N>(x);
     return cmul_impl(xx, y);
@@ -63,23 +66,24 @@ CMT_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y)
 
 /// Complex Multiplication
 template <typename T, size_t N1, size_t N2>
-CMT_INLINE vec<T, const_max(N1, N2)> cmul(vec<T, N1> x, vec<T, N2> y)
+CMT_INLINE vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y)
 {
     return internal::cmul_impl(x, y);
 }
 
 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, N> y)
+CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y)
 {
     return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y)));
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_2conj(vec<T, N> in0, vec<T, N> in1, vec<T, N> tw)
+CMT_INLINE vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw)
 {
     return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec<T, 2> in1, vec<T, N> tw)
+CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1,
+                           const vec<T, N>& tw)
 {
     const vec<T, N> twr   = dupeven(tw);
     const vec<T, N> twi   = dupodd(tw);
@@ -91,13 +95,13 @@ CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec<
     out1 += sumtw - diftw;
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, 2> y)
+CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y)
 {
     vec<T, N> yy = resize<N>(y);
     return cmul_conj(x, yy);
 }
 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y)
+CMT_INLINE vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y)
 {
     vec<T, N> xx = resize<N>(x);
     return cmul_conj(xx, y);
@@ -113,7 +117,7 @@ CMT_INLINE cvec<T, N> cread(const complex<T>* src)
 }
 
 template <size_t N, bool A = false, typename T>
-CMT_INLINE void cwrite(complex<T>* dest, cvec<T, N> value)
+CMT_INLINE void cwrite(complex<T>* dest, const cvec<T, N>& value)
 {
     return simd_write<A, N * 2>(ptr_cast<T>(dest), *value);
 }
@@ -124,7 +128,7 @@ CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<i
     return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
 }
 template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
-CMT_INLINE void cwrite_group_impl(complex<T>* dest, cvec<T, count * N> value, csizes_t<indices...>)
+CMT_INLINE void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>)
 {
     swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... };
 }
@@ -135,7 +139,7 @@ CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t str
     return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
 }
 template <size_t count, size_t N, bool A, typename T, size_t... indices>
-CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count * N> value,
+CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value,
                                   csizes_t<indices...>)
 {
     swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... };
@@ -144,25 +148,25 @@ CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count
 template <size_t count, size_t N, size_t stride, bool A = false, typename T>
 CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src)
 {
-    return cread_group_impl<count, N, stride, A>(src, csizeseq<count>);
+    return cread_group_impl<count, N, stride, A>(src, csizeseq_t<count>());
 }
 
 template <size_t count, size_t N, size_t stride, bool A = false, typename T>
-CMT_INLINE void cwrite_group(complex<T>* dest, cvec<T, count * N> value)
+CMT_INLINE void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value)
 {
-    return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq<count>);
+    return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq_t<count>());
 }
 
 template <size_t count, size_t N, bool A = false, typename T>
 CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride)
 {
-    return cread_group_impl<count, N, A>(src, stride, csizeseq<count>);
+    return cread_group_impl<count, N, A>(src, stride, csizeseq_t<count>());
 }
 
 template <size_t count, size_t N, bool A = false, typename T>
-CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N> value)
+CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value)
 {
-    return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq<count>);
+    return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq_t<count>());
 }
 
 template <size_t N, bool A = false, bool split = false, typename T>
@@ -175,11 +179,12 @@ CMT_INLINE cvec<T, N> cread_split(const complex<T>* src)
 }
 
 template <size_t N, bool A = false, bool split = false, typename T>
-CMT_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value)
+CMT_INLINE void cwrite_split(complex<T>* dest, const cvec<T, N>& value)
 {
+    cvec<T, N> v = value;
     if (split)
-        value = interleavehalfs(value);
-    simd_write<A, N * 2>(ptr_cast<T>(dest), *value);
+        v = interleavehalfs(v);
+    simd_write<A, N * 2>(ptr_cast<T>(dest), *v);
 }
 
 template <>
@@ -209,47 +214,51 @@ inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src)
 }
 
 template <>
-inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, cvec<f32, 8> x)
+inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x)
 {
-    x = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
+    const cvec<f32, 8> xx =
+        concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
 
     cvec<f32, 2> a, b, c, d;
-    split(x, a, b, c, d);
+    split(xx, a, b, c, d);
     cwrite<2>(dest, a);
     cwrite<2>(dest + 4, b);
     cwrite<2>(dest + 2, c);
     cwrite<2>(dest + 6, d);
 }
 template <>
-inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, cvec<f32, 8> x)
+inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x)
 {
-    x = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
+    const cvec<f32, 8> xx =
+        concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
 
     cvec<f32, 2> a, b, c, d;
-    split(x, a, b, c, d);
-    cwrite<2, true>(dest, a);
+    split(xx, a, b, c, d);
+    cwrite<2, true>(dest + 0, a);
     cwrite<2, true>(dest + 4, b);
     cwrite<2, true>(dest + 2, c);
     cwrite<2, true>(dest + 6, d);
 }
 
 template <>
-inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, cvec<f64, 4> x)
+inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x)
 {
-    x = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
-    cwrite<1>(dest, part<4, 0>(x));
-    cwrite<1>(dest + 2, part<4, 1>(x));
-    cwrite<1>(dest + 1, part<4, 2>(x));
-    cwrite<1>(dest + 3, part<4, 3>(x));
+    const cvec<f64, 4> xx =
+        concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
+    cwrite<1>(dest, part<4, 0>(xx));
+    cwrite<1>(dest + 2, part<4, 1>(xx));
+    cwrite<1>(dest + 1, part<4, 2>(xx));
+    cwrite<1>(dest + 3, part<4, 3>(xx));
 }
 template <>
-inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, cvec<f64, 4> x)
+inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x)
 {
-    x = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
-    cwrite<1, true>(dest, part<4, 0>(x));
-    cwrite<1, true>(dest + 2, part<4, 1>(x));
-    cwrite<1, true>(dest + 1, part<4, 2>(x));
-    cwrite<1, true>(dest + 3, part<4, 3>(x));
+    const cvec<f64, 4> xx =
+        concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
+    cwrite<1, true>(dest + 0, part<4, 0>(xx));
+    cwrite<1, true>(dest + 2, part<4, 1>(xx));
+    cwrite<1, true>(dest + 1, part<4, 2>(xx));
+    cwrite<1, true>(dest + 3, part<4, 3>(xx));
 }
 
 template <size_t N, size_t stride, typename T, size_t... Indices>
@@ -266,7 +275,7 @@ CMT_INLINE cvec<T, N> cgather(const complex<T>* base)
         return ref_cast<cvec<T, N>>(*base);
     }
     else
-        return cgather_helper<N, stride, T>(base, csizeseq<N>);
+        return cgather_helper<N, stride, T>(base, csizeseq_t<N>());
 }
 
 CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t)
@@ -294,13 +303,13 @@ CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size
 template <size_t N, typename T>
 CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride)
 {
-    return cgather_helper<N, T>(base, index, stride, csizeseq<N>);
+    return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
 }
 template <size_t N, typename T>
 CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride)
 {
     size_t index = 0;
-    return cgather_helper<N, T>(base, index, stride, csizeseq<N>);
+    return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
 }
 
 template <size_t N, typename T, size_t... Indices>
@@ -313,17 +322,17 @@ CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size
 template <size_t N, typename T>
 CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size)
 {
-    return cgather_helper<N, T>(base, index, stride, size, csizeseq<N>);
+    return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>());
 }
 
 template <size_t N, size_t stride, typename T, size_t... Indices>
-CMT_INLINE void cscatter_helper(complex<T>* base, cvec<T, N> value, csizes_t<Indices...>)
+CMT_INLINE void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>)
 {
     swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
 }
 
 template <size_t N, size_t stride, typename T>
-CMT_INLINE void cscatter(complex<T>* base, cvec<T, N> value)
+CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value)
 {
     if (stride == 1)
     {
@@ -331,38 +340,39 @@ CMT_INLINE void cscatter(complex<T>* base, cvec<T, N> value)
     }
     else
     {
-        return cscatter_helper<N, stride, T>(base, value, csizeseq<N>);
+        return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>());
     }
 }
 
 template <size_t N, typename T, size_t... Indices>
-CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, cvec<T, N> value, csizes_t<Indices...>)
+CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value,
+                                csizes_t<Indices...>)
 {
     swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
 }
 
 template <size_t N, typename T>
-CMT_INLINE void cscatter(complex<T>* base, size_t stride, cvec<T, N> value)
+CMT_INLINE void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value)
 {
-    return cscatter_helper<N, T>(base, stride, value, csizeseq<N>);
+    return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>());
 }
 
 template <size_t groupsize = 1, typename T, size_t N, typename IT>
-CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, vec<IT, N> offset)
+CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset)
 {
-    return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq<N>);
+    return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>());
 }
 
 template <size_t groupsize = 1, typename T, size_t N, typename IT>
-CMT_INLINE void cscatter(complex<T>* base, vec<IT, N> offset, vec<T, N * 2 * groupsize> value)
+CMT_INLINE void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value)
 {
-    return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq<N>);
+    return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>());
 }
 
 template <typename T>
-KFR_INTRIN void transpose4x8(cvec<T, 8> z0, cvec<T, 8> z1, cvec<T, 8> z2, cvec<T, 8> z3, cvec<T, 4>& w0,
-                             cvec<T, 4>& w1, cvec<T, 4>& w2, cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5,
-                             cvec<T, 4>& w6, cvec<T, 4>& w7)
+KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2,
+                             const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2,
+                             cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, cvec<T, 4>& w7)
 {
     cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3));
     cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3));
@@ -379,8 +389,9 @@ KFR_INTRIN void transpose4x8(cvec<T, 8> z0, cvec<T, 8> z1, cvec<T, 8> z2, cvec<T
 }
 
 template <typename T>
-KFR_INTRIN void transpose4x8(cvec<T, 4> w0, cvec<T, 4> w1, cvec<T, 4> w2, cvec<T, 4> w3, cvec<T, 4> w4,
-                             cvec<T, 4> w5, cvec<T, 4> w6, cvec<T, 4> w7, cvec<T, 8>& z0, cvec<T, 8>& z1,
+KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2,
+                             const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5,
+                             const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1,
                              cvec<T, 8>& z2, cvec<T, 8>& z3)
 {
     cvec<T, 16> a = concat(w0, w1, w2, w3);
@@ -453,20 +464,20 @@ constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...
                                     : cos_using_table<T>(size, indices / 2 * step + start))...);
 }
 
-template <typename T, size_t width, size_t size, size_t start, size_t step, bool inverse = false>
-constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle()
+template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false>
+constexpr KFR_INTRIN cvec<T, width> fixed_twiddle()
 {
-    return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq<width * 2>);
+    return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>());
 }
 
 template <typename T, size_t width>
-constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle(size_t size, size_t start, size_t step = 0)
+constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0)
 {
-    return get_fixed_twiddle_helper<T, width>(csizeseq<width * 2>, start, step, size);
+    return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size);
 }
 
-template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false>
-constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>();
+// template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false>
+// constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>();
 
 template <typename T, size_t N, bool inverse>
 constexpr cvec<T, N> twiddleimagmask()
@@ -474,19 +485,19 @@ constexpr cvec<T, N> twiddleimagmask()
     return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1);
 }
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wconversion"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion")
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 template <typename T, size_t N>
-CMT_NOINLINE static vec<T, N> cossin_conj(vec<T, N> x)
+CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x)
 {
     return cconj(cossin(x));
 }
 
 template <size_t k, size_t size, bool inverse = false, typename T, size_t width>
-KFR_INTRIN vec<T, width> cmul_by_twiddle(vec<T, width> x)
+KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
 {
     constexpr size_t kk = (inverse ? size - k : k) % size;
     constexpr T isqrt2  = static_cast<T>(0.70710678118654752440084436210485);
@@ -524,12 +535,12 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(vec<T, width> x)
     }
     else
     {
-        return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>));
+        return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>()));
     }
 }
 
 template <size_t N, typename T>
-KFR_INTRIN void butterfly2(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N>& w0, cvec<T, N>& w1)
+KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
 {
     w0 = a0 + a1;
     w1 = a0 - a1;
@@ -542,8 +553,9 @@ KFR_INTRIN void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1)
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2,
-                           cvec<T, N> a3, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
+                           const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
+                           cvec<T, N>& w2, cvec<T, N>& w3)
 {
     cvec<T, N> sum02, sum13, diff02, diff13;
     cvec<T, N * 2> a01, a23, sum0213, diff0213;
@@ -575,8 +587,9 @@ KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, cvec<T, N> a0, cvec<T, N> 
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2,
-                           cvec<T, N> a3, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
+                           const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
+                           cvec<T, N>& w2, cvec<T, N>& w3)
 {
     vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3;
     vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3;
@@ -601,15 +614,16 @@ KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly8(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4,
-                           cvec<T, N> a5, cvec<T, N> a6, cvec<T, N> a7, cvec<T, N>& w0, cvec<T, N>& w1,
+KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+                           const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+                           const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1,
                            cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6,
                            cvec<T, N>& w7)
 {
     cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6;
-    butterfly4<N, inverse>(cbool<false>, b0, b2, b4, b6, b0, b2, b4, b6);
+    butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6);
     cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7;
-    butterfly4<N, inverse>(cbool<false>, b1, b3, b5, b7, b1, b3, b5, b7);
+    butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7);
     w0 = b0 + b1;
     w4 = b0 - b1;
 
@@ -637,7 +651,7 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv
 {
     cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67;
 
-    butterfly4<2, inverse>(cbool<false>, b01, b23, b45, b67, b01, b23, b45, b67);
+    butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67);
 
     cvec<T, 2> b02, b13, b46, b57;
 
@@ -645,8 +659,8 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv
     cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567));
     split(b02461357, b02, b46, b13, b57);
 
-    b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>);
-    b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>);
+    b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>());
+    b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>());
     a01 = b02 + b13;
     a23 = b46 + b57;
     a45 = b02 - b13;
@@ -669,13 +683,13 @@ KFR_INTRIN void butterfly32(cvec<T, 32>& v32)
     split(v32, w0, w1, w2, w3, w4, w5, w6, w7);
     butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
 
-    w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>);
-    w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>);
-    w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>);
-    w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>);
-    w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>);
-    w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>);
-    w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>);
+    w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
+    w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
+    w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
+    w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
+    w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
+    w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
+    w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
 
     cvec<T, 8> z0, z1, z2, z3;
     transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
@@ -707,7 +721,7 @@ KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01)
 }
 
 template <size_t N, bool inverse = false, bool split_format = false, typename T>
-KFR_INTRIN void apply_twiddle(cvec<T, N> a1, cvec<T, N> tw1, cvec<T, N>& w1)
+KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1)
 {
     if (split_format)
     {
@@ -723,18 +737,20 @@ KFR_INTRIN void apply_twiddle(cvec<T, N> a1, cvec<T, N> tw1, cvec<T, N>& w1)
     }
     else
     {
-        cvec<T, N> b1 = a1 * dupeven(tw1);
-        a1 = swap<2>(a1);
+        const cvec<T, N> b1  = a1 * dupeven(tw1);
+        const cvec<T, N> a1_ = swap<2>(a1);
 
+        cvec<T, N> tw1_ = tw1;
         if (inverse)
-            tw1 = -(tw1);
-        w1      = subadd(b1, a1 * dupodd(tw1));
+            tw1_ = -(tw1_);
+        w1       = subadd(b1, a1_ * dupodd(tw1_));
     }
 }
 
 template <size_t N, bool inverse = false, bool split_format = false, typename T>
-KFR_INTRIN void apply_twiddles4(cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> tw1, cvec<T, N> tw2,
-                                cvec<T, N> tw3, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3,
+                                const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3,
+                                cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
 {
     apply_twiddle<N, inverse, split_format>(a1, tw1, w1);
     apply_twiddle<N, inverse, split_format>(a2, tw2, w2);
@@ -743,14 +759,16 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cve
 
 template <size_t N, bool inverse = false, typename T>
 KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
-                                cvec<T, N>& __restrict a3, cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3)
+                                cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2,
+                                const cvec<T, N>& tw3)
 {
     apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3);
 }
 
 template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]>
 KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
-                                cvec<T, N>& __restrict a3, cvec<T, 1> tw1, cvec<T, 1> tw2, cvec<T, 1> tw3)
+                                cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2,
+                                const cvec<T, 1>& tw3)
 {
     apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
 }
@@ -800,9 +818,9 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
     cvec<T, N> a3;
     split(a0123, a0, a1, a2, a3);
 
-    cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>,
-               tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>,
-               tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>;
+    cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(),
+               tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(),
+               tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>();
 
     apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3);
 
@@ -984,23 +1002,23 @@ KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
     w3 = digitreverse4<2>(w3);
 
     transpose4(w0, w1, w2, w3);
-    cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>));
-    cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>));
-    cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>));
-    cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>));
+    cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>()));
+    cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>()));
+    cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>()));
+    cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>()));
 }
 
 template <size_t n2, size_t nnstep, size_t N, typename T>
 KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1)
 {
-    cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>;
+    cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>();
 
     a1 = cmul(a1, tw1);
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01,
-                           cvec<T, N>& w02)
+KFR_INTRIN void butterfly3(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
+                           cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02)
 {
     constexpr cvec<T, N> tw3r1 = static_cast<T>(-0.5);
     constexpr cvec<T, N> tw3i1 =
@@ -1025,9 +1043,9 @@ KFR_INTRIN void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2)
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly6(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4,
-                           cvec<T, N> a5, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3,
-                           cvec<T, N>& w4, cvec<T, N>& w5)
+KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+                           const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, cvec<T, N>& w0,
+                           cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5)
 {
     cvec<T, N* 2> a03 = concat(a0, a3);
     cvec<T, N* 2> a25 = concat(a2, a5);
@@ -1056,8 +1074,9 @@ KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
-                           cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
+                           const cvec<T, N>& a03, const cvec<T, N>& a04, const cvec<T, N>& a05,
+                           const cvec<T, N>& a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
                            cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
 {
     constexpr cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884);
@@ -1102,9 +1121,9 @@ KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly5(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
-                           cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, cvec<T, N>& w03,
-                           cvec<T, N>& w04)
+KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
+                           const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
+                           cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
 {
     constexpr cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718);
     constexpr cvec<T, N> tw5i1 =
@@ -1132,10 +1151,12 @@ KFR_INTRIN void butterfly5(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<
 }
 
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly10(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4,
-                            cvec<T, N> a5, cvec<T, N> a6, cvec<T, N> a7, cvec<T, N> a8, cvec<T, N> a9,
-                            cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
-                            cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8, cvec<T, N>& w9)
+KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+                            const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+                            const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
+                            const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2,
+                            cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7,
+                            cvec<T, N>& w8, cvec<T, N>& w9)
 {
     cvec<T, N* 2> a05 = concat(a0, a5);
     cvec<T, N* 2> a27 = concat(a2, a7);
@@ -1170,64 +1191,69 @@ KFR_INTRIN void butterfly10(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T,
 }
 
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N>& out0, vec<T, N>& out1)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0,
+                          vec<T, N>& out1)
 {
     butterfly2<N / 2>(in0, in1, out0, out1);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N>& out0,
-                          vec<T, N>& out1, vec<T, N>& out2)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2)
 {
     butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2);
 }
 
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
+                          vec<T, N>& out3)
 {
     butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N> in4, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
-                          vec<T, N>& out4)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, vec<T, N>& out0, vec<T, N>& out1,
+                          vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4)
 {
     butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N> in4, vec<T, N> in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
-                          vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, vec<T, N>& out0,
+                          vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
 {
     butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N>& out0, vec<T, N>& out1,
-                          vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+                          const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
+                          vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6)
 {
     butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N> in7, vec<T, N>& out0,
-                          vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
-                          vec<T, N>& out6, vec<T, N>& out7)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+                          const vec<T, N>& in6, const vec<T, N>& in7, vec<T, N>& out0, vec<T, N>& out1,
+                          vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
+                          vec<T, N>& out7)
 {
     butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5,
                                out6, out7);
 }
 template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3,
-                          vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N> in7, vec<T, N> in8,
-                          vec<T, N> in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
-                          vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8,
-                          vec<T, N>& out9)
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+                          const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
+                          const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
+                          vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
+                          vec<T, N>& out8, vec<T, N>& out9)
 {
     butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
                                 out4, out5, out6, out7, out8, out9);
 }
-template <bool transposed, typename T, size_t... N, size_t Nout = csum(csizes<N...>)>
+template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
 KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
 {
     vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr));
@@ -1255,7 +1281,7 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3
     w4 = cgather<4, 5>(ptr + 4);
 }
 
-template <bool transposed, typename T, size_t... N, size_t Nout = csum(csizes<N...>)>
+template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
 KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
 {
     auto temp = concat(args...);
@@ -1265,12 +1291,12 @@ KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N
 }
 
 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
-KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, vec<T, N> x, const complex<T>* twiddle)
+KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle)
 {
     return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1)));
 }
 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
-KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, vec<T, N> x, const complex<T>* twiddle)
+KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle)
 {
     return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1)));
 }
@@ -1282,13 +1308,14 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize
 {
     carray<cvec<T, width>, radix> inout;
 
-    swallow{ (inout.get(csize<I>) = cread<width>(in + i + stride * I))... };
+    swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... };
 
-    butterfly(cbool<inverse>, inout.template get<I>()..., inout.template get<I>()...);
+    butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);
 
-    swallow{ (cwrite<width>(out + i + stride * I,
-                            mul_tw<I, radix>(cbool<inverse>, inout.template get<I>(), tw + i * (radix - 1))),
-              0)... };
+    swallow{ (
+        cwrite<width>(out + i + stride * I,
+                      mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))),
+        0)... };
 }
 
 // Final
@@ -1299,17 +1326,17 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize
     carray<cvec<T, width>, radix> inout;
 
     //        swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... };
-    cread_transposed(cbool<true>, in + i * radix, inout.template get<I>()...);
+    cread_transposed(ctrue, in + i * radix, inout.template get<I>()...);
 
-    butterfly(cbool<inverse>, inout.template get<I>()..., inout.template get<I>()...);
+    butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);
 
-    swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize<I>)), 0)... };
+    swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... };
 }
 
 template <size_t width, size_t radix, typename... Args>
 KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args)
 {
-    butterfly_helper(csizeseq<radix>, i, csize<width>, csize<radix>, std::forward<Args>(args)...);
+    butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...);
 }
 
 template <typename... Args>
@@ -1321,8 +1348,8 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&.
 {
     CMT_LOOP_NOUNROLL
     for (; i < count / width * width; i += width)
-        butterfly(i, csize<width>, std::forward<Args>(args)...);
-    butterfly_cycle(i, count, csize<width / 2>, std::forward<Args>(args)...);
+        butterfly(i, csize_t<width>(), std::forward<Args>(args)...);
+    butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...);
 }
 
 template <size_t width, typename... Args>
@@ -1330,7 +1357,7 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args)
 {
     CMT_ASSUME(count > 0);
     size_t i = 0;
-    butterfly_cycle(i, count, csize<width>, std::forward<Args>(args)...);
+    butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...);
 }
 
 template <typename T, bool inverse, typename Tstride>
@@ -1376,7 +1403,7 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in
                             reverse<2>(sum1));
         }
     }
-    generic_butterfly_cycle(csize<width / 2>, radix, cbool<inverse>, out, in, ostride, halfradix,
+    generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
                             halfradix_sqr, twiddle, i);
 }
 
@@ -1406,8 +1433,8 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* 
     CMT_ASSUME(halfradix > 0);
     size_t i = 0;
 
-    generic_butterfly_cycle(csize<width>, radix, cbool<inverse>, out, in, ostride, halfradix, halfradix_sqr,
-                            twiddle, i);
+    generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
+                            halfradix_sqr, twiddle, i);
 }
 
 template <typename T, bool inverse, typename Tstride = csize_t<1>>
@@ -1419,14 +1446,14 @@ KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* ou
         builtin_memcpy(temp, in, sizeof(complex<T>) * radix);
         in = temp;
     }
-    constexpr size_t width = vector_width<T, cpu_t::native>;
+    constexpr size_t width = platform<T>::vector_width;
 
-    cswitch(csizes<11>, radix,
+    cswitch(csizes_t<11>(), radix,
             [&](auto radix_) CMT_INLINE_LAMBDA {
-                generic_butterfly_w<width>(decltype(radix_)(), cbool<inverse>, out, in, twiddle, ostride);
+                generic_butterfly_w<width>(decltype(radix_)(), cbool_t<inverse>(), out, in, twiddle, ostride);
             },
             [&]() CMT_INLINE_LAMBDA {
-                generic_butterfly_w<width>(radix, cbool<inverse>, out, in, twiddle, ostride);
+                generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
             });
 }
 
@@ -1437,13 +1464,13 @@ template <typename T, size_t N>
 constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T());
 
 template <bool A = false, typename T, size_t N>
-KFR_INTRIN void cbitreverse_write(complex<T>* dest, vec<T, N> x)
+KFR_INTRIN void cbitreverse_write(complex<T>* dest, const vec<T, N>& x)
 {
     cwrite<N / 2, A>(dest, bitreverse<2>(x));
 }
 
 template <bool A = false, typename T, size_t N>
-KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, vec<T, N> x)
+KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x)
 {
     cwrite<N / 2, A>(dest, digitreverse4<2>(x));
 }
@@ -1471,7 +1498,7 @@ KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>*
                   cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15));
 }
 template <>
-KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, vec<f64, 32> x)
+KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x)
 {
     cwrite<1>(dest, part<16, 0>(x));
     cwrite<1>(dest + 4, part<16, 1>(x));
@@ -1496,3 +1523,5 @@ KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, vec<f64
 #endif
 }
 }
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp
@@ -108,7 +108,7 @@ struct biquad_block
     vec<T, filters> b2;
 
     constexpr biquad_block() noexcept : a1(0), a2(0), b0(1), b1(0), b2(0) {}
-    constexpr biquad_block(const biquad_params<T>* bq, size_t count) noexcept
+    CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) noexcept
     {
         count = count > filters ? filters : count;
         for (size_t i = 0; i < count; i++)
@@ -161,7 +161,7 @@ struct expression_biquads : public expression<E1>
         return out;
     }
     KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state,
-                                        vec<T, filters> in)
+                                        const vec<T, filters>& in)
     {
         const vec<T, filters> out = bq.b0 * in + state.s1;
         state.s1 = state.s2 + bq.b1 * in - bq.a1 * out;
@@ -301,7 +301,7 @@ CMT_INLINE internal::expression_biquads_zl<filters, T, internal::arg<E1>> biquad
 template <typename T, typename E1>
 CMT_INLINE expression_pointer<T> biquad_zl(const biquad_params<T>* bq, size_t count, E1&& e1)
 {
-    return cswitch(csizes<1, 2, 4, 8, 16, 32, 64>, next_poweroftwo(count),
+    return cswitch(csizes_t<1, 2, 4, 8, 16, 32, 64>(), next_poweroftwo(count),
                    [&](auto x) {
                        constexpr size_t filters = x;
                        return to_pointer(internal::expression_biquads_zl<filters, T, internal::arg<E1>>(
diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp
@@ -103,7 +103,7 @@ struct expression_delay<1, E> : expression<E>
  * @endcode
  */
 template <size_t samples = 1, typename E1>
-CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize<samples>)
+CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>())
 {
     static_assert(samples >= 1 && samples < 1024, "");
     return internal::expression_delay<samples, E1>(std::forward<E1>(e1));
diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp
@@ -61,7 +61,7 @@ struct expression_short_fir : expression<E1>
         vec<T, N> in = this->argument_first(cinput, index, x);
 
         vec<T, N> out = in * taps[0];
-        cfor(csize<1>, csize<tapcount>,
+        cfor(csize_t<1>(), csize_t<tapcount>(),
              [&](auto I) { out = out + concat_and_slice<tapcount - 1 - I, N>(delayline, in) * taps[I]; });
         delayline = concat_and_slice<N, tapcount - 1>(delayline, in);
 
diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp
@@ -45,7 +45,7 @@ struct stereo_matrix
     template <typename T, size_t N>
     CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const
     {
-        return process(x, csizeseq<N>);
+        return process(x, csizeseq_t<N>());
     }
     template <typename T, size_t N, size_t... indices>
     CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const
@@ -56,8 +56,16 @@ struct stereo_matrix
 };
 }
 
-constexpr f64x2x2 matrix_sum_diff() { return { f64x2{ 1, 1 }, f64x2{ 1, -1 } }; }
-constexpr f64x2x2 matrix_halfsum_halfdiff() { return { f64x2{ 0.5, 0.5 }, f64x2{ 0.5, -0.5 } }; }
+template <int = 0>
+CMT_GNU_CONSTEXPR f64x2x2 matrix_sum_diff()
+{
+    return { f64x2{ 1, 1 }, f64x2{ 1, -1 } };
+}
+template <int = 0>
+CMT_GNU_CONSTEXPR f64x2x2 matrix_halfsum_halfdiff()
+{
+    return { f64x2{ 0.5, 0.5 }, f64x2{ 0.5, -0.5 } };
+}
 
 /**
  * @brief Returns template expression that returns the vector of length 2 containing mix of the left and right
diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp
@@ -34,7 +34,8 @@ namespace kfr
 template <typename T = fbase>
 auto jaehne(identity<T> magn, size_t size)
 {
-    return truncate(magn * sin(c_pi<T, 1, 2> * sqr(linspace(T(0), T(size), size, false)) / size), size);
+    return truncate(magn * sin(constants<T>::pi_s(1, 2) * sqr(linspace(T(0), T(size), size, false)) / size),
+                    size);
 }
 
 template <typename T = fbase>
@@ -60,78 +61,78 @@ auto phasor(identity<T> frequency, identity<T> sample_rate)
 namespace intrinsics
 {
 template <typename T>
-KFR_SINTRIN T rawsine(T x)
+KFR_SINTRIN T rawsine(const T& x)
 {
-    return intrinsics::fastsin(x * c_pi<T, 2>);
+    return intrinsics::fastsin(x * constants<T>::pi_s(2));
 }
 template <typename T>
-KFR_SINTRIN T sinenorm(T x)
+KFR_SINTRIN T sinenorm(const T& x)
 {
     return intrinsics::rawsine(fract(x));
 }
 template <typename T>
-KFR_SINTRIN T sine(T x)
+KFR_SINTRIN T sine(const T& x)
 {
-    return intrinsics::sinenorm(c_recip_pi<T, 1, 2> * x);
+    return intrinsics::sinenorm(constants<T>::recip_pi_s(1, 2) * x);
 }
 
 template <typename T>
-KFR_SINTRIN T rawsquare(T x)
+KFR_SINTRIN T rawsquare(const T& x)
 {
     return select(x < T(0.5), T(1), -T(1));
 }
 template <typename T>
-KFR_SINTRIN T squarenorm(T x)
+KFR_SINTRIN T squarenorm(const T& x)
 {
     return intrinsics::rawsquare(fract(x));
 }
 template <typename T>
-KFR_SINTRIN T square(T x)
+KFR_SINTRIN T square(const T& x)
 {
-    return intrinsics::squarenorm(c_recip_pi<T, 1, 2> * x);
+    return intrinsics::squarenorm(constants<T>::recip_pi_s(1, 2) * x);
 }
 
 template <typename T>
-KFR_SINTRIN T rawsawtooth(T x)
+KFR_SINTRIN T rawsawtooth(const T& x)
 {
     return T(1) - 2 * x;
 }
 template <typename T>
-KFR_SINTRIN T sawtoothnorm(T x)
+KFR_SINTRIN T sawtoothnorm(const T& x)
 {
     return intrinsics::rawsawtooth(fract(x));
 }
 template <typename T>
-KFR_SINTRIN T sawtooth(T x)
+KFR_SINTRIN T sawtooth(const T& x)
 {
-    return intrinsics::sawtoothnorm(c_recip_pi<T, 1, 2> * x);
+    return intrinsics::sawtoothnorm(constants<T>::recip_pi_s(1, 2) * x);
 }
 
 template <typename T>
-KFR_SINTRIN T isawtoothnorm(T x)
+KFR_SINTRIN T isawtoothnorm(const T& x)
 {
     return T(-1) + 2 * fract(x + 0.5);
 }
 template <typename T>
-KFR_SINTRIN T isawtooth(T x)
+KFR_SINTRIN T isawtooth(const T& x)
 {
-    return intrinsics::isawtoothnorm(c_recip_pi<T, 1, 2> * x);
+    return intrinsics::isawtoothnorm(constants<T>::recip_pi_s(1, 2) * x);
 }
 
 template <typename T>
-KFR_SINTRIN T rawtriangle(T x)
+KFR_SINTRIN T rawtriangle(const T& x)
 {
     return 1 - abs(4 * x - 2);
 }
 template <typename T>
-KFR_SINTRIN T trianglenorm(T x)
+KFR_SINTRIN T trianglenorm(const T& x)
 {
     return intrinsics::rawtriangle(fract(x + 0.25));
 }
 template <typename T>
-KFR_SINTRIN T triangle(T x)
+KFR_SINTRIN T triangle(const T& x)
 {
-    return intrinsics::trianglenorm(c_recip_pi<T, 1, 2> * x);
+    return intrinsics::trianglenorm(constants<T>::recip_pi_s(1, 2) * x);
 }
 }
 KFR_I_FN(rawsine)
diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp
@@ -297,13 +297,13 @@ struct expression_downsample<4, offset, E> : expression<E>
 }
 
 template <typename E1, size_t offset = 0>
-CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize<0>)
+CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize_t<0>())
 {
     return internal::expression_downsample<2, offset, E1>(std::forward<E1>(e1));
 }
 
 template <typename E1, size_t offset = 0>
-CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize<0>)
+CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize_t<0>())
 {
     return internal::expression_downsample<4, offset, E1>(std::forward<E1>(e1));
 }
diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp
@@ -37,28 +37,28 @@ using sample_rate_t = double;
 namespace intrinsics
 {
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF amp_to_dB(T amp)
+KFR_SINTRIN TF amp_to_dB(const T& amp)
 {
     return log(static_cast<TF>(amp)) * subtype<TF>(8.6858896380650365530225783783322);
     // return T( 20.0 ) * log10( level );
 }
 
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF dB_to_amp(T dB)
+KFR_SINTRIN TF dB_to_amp(const T& dB)
 {
     return exp(dB * subtype<TF>(0.11512925464970228420089957273422));
     // return exp10( dB / 20 );
 }
 
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF amp_to_dB(T amp, T offset)
+KFR_SINTRIN TF amp_to_dB(const T& amp, const T& offset)
 {
     return log_fmadd(amp, subtype<TF>(8.6858896380650365530225783783322), offset);
     // return T( 20.0 ) * log10( level );
 }
 
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF dB_to_amp(T dB, T offset)
+KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset)
 {
     auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset;
     return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs);
@@ -66,13 +66,13 @@ KFR_SINTRIN TF dB_to_amp(T dB, T offset)
 }
 
 template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout power_to_dB(T x)
+KFR_SINTRIN Tout power_to_dB(const T& x)
 {
     return log(x) * (10 * c_recip_log_10<Tout>);
 }
 
 template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout dB_to_power(T x)
+KFR_SINTRIN Tout dB_to_power(const T& x)
 {
     if (x == -c_infinity<Tout>)
         return 0.0;
@@ -81,7 +81,7 @@ KFR_SINTRIN Tout dB_to_power(T x)
 }
 
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF note_to_hertz(T note)
+KFR_SINTRIN TF note_to_hertz(const T& note)
 {
     const subtype<TF> offset = 2.1011784386926213177653145771814;
 
@@ -89,7 +89,7 @@ KFR_SINTRIN TF note_to_hertz(T note)
 }
 
 template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF hertz_to_note(T hertz)
+KFR_SINTRIN TF hertz_to_note(const T& hertz)
 {
     const subtype<TF> offset = -36.376316562295915248836189714583;
 
@@ -97,7 +97,7 @@ KFR_SINTRIN TF hertz_to_note(T hertz)
 }
 
 template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>>
-KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz)
+KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz)
 {
     const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818);
 
@@ -105,7 +105,7 @@ KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz)
 }
 
 template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>>
-KFR_SINTRIN Tc hertz_to_note(T1 hertz, T2 tunenote, T3 tunehertz)
+KFR_SINTRIN Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz)
 {
     const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023);
 
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -604,10 +604,10 @@ CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identit
                                           ctype_t<T>               = ctype_t<T>())
 {
     return cswitch(
-        cvals<window_type, window_type::rectangular, window_type::triangular, window_type::bartlett,
-              window_type::cosine, window_type::hann, window_type::bartlett_hann, window_type::hamming,
-              window_type::bohman, window_type::blackman, window_type::blackman_harris, window_type::kaiser,
-              window_type::flattop, window_type::gaussian, window_type::lanczos>,
+        cvals_t<window_type, window_type::rectangular, window_type::triangular, window_type::bartlett,
+                window_type::cosine, window_type::hann, window_type::bartlett_hann, window_type::hamming,
+                window_type::bohman, window_type::blackman, window_type::blackman_harris, window_type::kaiser,
+                window_type::flattop, window_type::gaussian, window_type::lanczos>(),
         type,
         [=](auto win) {
             constexpr window_type window = val_of(decltype(win)());
diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp
@@ -44,7 +44,8 @@ void write_interleaved(E1&& dest, const univector2d<Tin, Tag1, Tag2>& src)
     }
     else if (channels == 2)
     {
-        process(std::forward<E1>(dest), pack(src[0], src[1]), 0, infinite_size, nullptr, nullptr, csize<2>);
+        process(std::forward<E1>(dest), pack(src[0], src[1]), 0, infinite_size, nullptr, nullptr,
+                csize_t<2>());
     }
     else
     {
@@ -89,7 +90,8 @@ constexpr range<fmax> audio_range<f64>()
 
 inline size_t get_audiobitdepth(audiodatatype type)
 {
-    return (size_t[]){ 0, 16, 24, 24, 32, 32, 64 }[static_cast<int>(type)];
+    constexpr size_t bits[]{ 0, 16, 24, 24, 32, 32, 64 };
+    return static_cast<size_t>(type) < 7 ? bits[static_cast<size_t>(type)] : 0;
 }
 
 template <typename T>
@@ -130,6 +132,24 @@ static constexpr u32 FourCC(const char (&ch)[5])
     return u32(u8(ch[0])) | u32(u8(ch[1])) << 8 | u32(u8(ch[2])) << 16 | u32(u8(ch[3])) << 24;
 }
 
+constexpr u32 cWAVE_FORMAT_PCM  = 1;
+constexpr u32 cWAVE_FORMAT_IEEE = 3;
+
+constexpr u32 ccRIFF = FourCC("RIFF");
+constexpr u32 ccWAVE = FourCC("WAVE");
+constexpr u32 ccfmt  = FourCC("fmt ");
+constexpr u32 ccdata = FourCC("data");
+
+constexpr u32 ccFORM = FourCC("FORM");
+constexpr u32 ccAIFF = FourCC("AIFF");
+constexpr u32 ccAIFC = FourCC("AIFC");
+constexpr u32 ccCOMM = FourCC("COMM");
+constexpr u32 ccSSND = FourCC("SSND");
+constexpr u32 ccNONE = FourCC("NONE");
+constexpr u32 ccsowt = FourCC("sowt");
+
+CMT_PRAGMA_PACK_PUSH_1
+
 struct WAV_FMT
 {
     i32 fId; // 'fmt '
@@ -140,20 +160,20 @@ struct WAV_FMT
     i32 nAvgBytesPerSec;
     i16 numBlockAlingn;
     i16 numBitsPerSample;
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct WAV_DATA
 {
     i32 dId; // 'data' or 'fact'
     i32 dLen;
     u8 data[1];
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct WAV_DATA_HDR
 {
     i32 dId; // 'data' or 'fact'
     i32 dLen;
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct AIFF_FMT
 {
@@ -164,53 +184,39 @@ struct AIFF_FMT
     i16 bitsPerSample;
     f80 sampleRate;
     i32 compression;
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct AIFF_DATA
 {
     i32 chunkID;
     i32 chunkLen;
     u32 offset;
-} __attribute__((packed));
-
-constexpr u32 cWAVE_FORMAT_PCM  = 1;
-constexpr u32 cWAVE_FORMAT_IEEE = 3;
-
-constexpr u32 ccRIFF = FourCC("RIFF");
-constexpr u32 ccWAVE = FourCC("WAVE");
-constexpr u32 ccfmt  = FourCC("fmt ");
-constexpr u32 ccdata = FourCC("data");
-
-constexpr u32 ccFORM = FourCC("FORM");
-constexpr u32 ccAIFF = FourCC("AIFF");
-constexpr u32 ccAIFC = FourCC("AIFC");
-constexpr u32 ccCOMM = FourCC("COMM");
-constexpr u32 ccSSND = FourCC("SSND");
-constexpr u32 ccNONE = FourCC("NONE");
-constexpr u32 ccsowt = FourCC("sowt");
+} CMT_GNU_PACKED;
 
 struct RIFF_HDR
 {
     i32 riffID; // 'RIFF' or 'COMM'
     i32 fileLen;
     i32 formatID; // 'WAVE' or 'AIFF'
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct WAV_HEADER
 {
     RIFF_HDR riff;
     WAV_FMT fmt;
     WAV_DATA_HDR data;
-
-} __attribute__((packed));
+} CMT_GNU_PACKED;
 
 struct CHUNK_HDR
 {
     i32 chunkID;
     i32 chunkLen;
-} __attribute__((packed));
+} CMT_GNU_PACKED;
+
+CMT_PRAGMA_PACK_POP
 
-static bool audio_test_wav(const array_ref<u8>& rawbytes)
+template <size_t = 0>
+bool audio_test_wav(const array_ref<u8>& rawbytes)
 {
     if (rawbytes.size() < sizeof(RIFF_HDR))
     {
@@ -228,7 +234,8 @@ static bool audio_test_wav(const array_ref<u8>& rawbytes)
     return true;
 }
 
-static bool audio_test_aiff(const array_ref<u8>& rawbytes)
+template <size_t = 0>
+bool audio_test_aiff(const array_ref<u8>& rawbytes)
 {
     if (rawbytes.size() < sizeof(RIFF_HDR))
     {
@@ -255,7 +262,8 @@ enum class file_status
     unsupported_bit_format
 };
 
-static file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbytes)
+template <size_t = 0>
+file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbytes)
 {
     const CHUNK_HDR* chunk  = ptr_cast<CHUNK_HDR>(rawbytes.data() + 12);
     const void* end         = ptr_cast<char>(rawbytes.end());
@@ -324,7 +332,8 @@ static file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbyt
     return file_status::ok;
 }
 
-static file_status audio_info(audioformat& info, const array_ref<u8>& file_bytes)
+template <size_t = 0>
+file_status audio_info(audioformat& info, const array_ref<u8>& file_bytes)
 {
     if (audio_test_wav(file_bytes))
         return audio_info_wav(info, file_bytes);
diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp
@@ -40,9 +40,9 @@ namespace kfr
 {
 namespace internal
 {
-#pragma clang diagnostic push
+CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wdeprecated-declarations")
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wdeprecated-declarations")
 #endif
 
 template <int = 0>
@@ -66,7 +66,7 @@ void python(const std::string& name, const std::string& code)
     fclose(f);
     std::system(("python \"" + filename + "\"").c_str());
 }
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 }
 
 inline std::string concat_args() { return {}; }
diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp
@@ -111,6 +111,7 @@ inline std::string array_to_string(const kfr::complex<T>* source, size_t N)
 template <typename T>
 struct representation<kfr::complex<T>>
 {
+    using type = std::string;
     static std::string get(const kfr::complex<T>& value)
     {
         return as_string(value.real()) + " + " + as_string(value.imag()) + "j";
@@ -120,12 +121,14 @@ struct representation<kfr::complex<T>>
 template <>
 struct representation<kfr::cpu_t>
 {
+    using type = std::string;
     static std::string get(kfr::cpu_t value) { return kfr::cpu_name(value); }
 };
 
 template <typename T, size_t N>
 struct representation<kfr::vec<T, N>>
 {
+    using type = std::string;
     static std::string get(const kfr::vec<T, N>& value)
     {
         return details::array_to_string(value.data(), value.size());
@@ -135,6 +138,7 @@ struct representation<kfr::vec<T, N>>
 template <typename T, size_t Tag>
 struct representation<kfr::univector<T, Tag>>
 {
+    using type = std::string;
     static std::string get(const kfr::univector<T, Tag>& value)
     {
         return details::array_to_string(value.data(), value.size());
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -67,28 +67,33 @@ else ()
     set(EMULATOR "")
 endif ()
 
+#get_cmake_property(_variableNames VARIABLES)
+#foreach (_variableName ${_variableNames})
+#    message(STATUS "${_variableName}=${${_variableName}}")
+#endforeach()
+
 if (NOT IOS)
     enable_testing()
 
     add_test(NAME base_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/base_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/base_test)
     add_test(NAME intrinsic_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/intrinsic_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/intrinsic_test)
     add_test(NAME dsp_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/dsp_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dsp_test)
     if (MPFR_FOUND)
         add_test(NAME transcendental_test
-                COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/transcendental_test)
+                COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/transcendental_test)
     endif ()
     add_test(NAME complex_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/complex_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/complex_test)
     add_test(NAME expression_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/expression_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/expression_test)
 
     if (NOT ARM)
         add_test(NAME multiarch
-                COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/multiarch)
+                COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/multiarch)
     endif ()
     add_test(NAME dft_test
-            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/dft_test)
+            COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dft_test)
 endif ()
diff --git a/tests/base_test.cpp b/tests/base_test.cpp
@@ -76,16 +76,17 @@ TEST(test_basic)
     CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 });
 
     // * The following command pairs are equivalent:
-    CHECK(permute(numbers1, elements<0, 2, 1, 3, 4, 6, 5, 7>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
-    CHECK(permute(numbers1, elements<0, 2, 1, 3>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+    CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+    CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
 
-    CHECK(shuffle(numbers1, numbers2, elements<0, 8, 2, 10, 4, 12, 6, 14>) ==
+    CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) ==
           vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
-    CHECK(shuffle(numbers1, numbers2, elements<0, 8>) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
+    CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
 
-    CHECK(blend(numbers1, numbers2, elements<0, 1, 1, 0, 1, 1, 0, 1>) ==
+    CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) ==
+          vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
+    CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) ==
           vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
-    CHECK(blend(numbers1, numbers2, elements<0, 1, 1>) == vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
 
     // * Transpose matrix:
     const auto sixteen = enumerate<float, 16>();
@@ -154,6 +155,9 @@ TEST(vec_zerovector)
 
 TEST(vec_allonesvector)
 {
+    CHECK(bitcast<u32>(constants<f32>::allones()) == 0xFFFFFFFFu);
+    CHECK(bitcast<u64>(constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull);
+
     CHECK(~allonesvector<f32, 3>() == f32x3{ 0, 0, 0 });
     CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 });
     CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 });
@@ -221,6 +225,10 @@ TEST(vec_is_convertible)
     static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, "");
     static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, "");
 
+    testo::assert_is_same<i32x4, common_type<i32x4>>();
+    testo::assert_is_same<u32x4, common_type<i32x4, u32x4>>();
+    testo::assert_is_same<f64x4, common_type<i32x4, u32x4, f64x4>>();
+
     CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f });
     CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 });
     CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 });
@@ -282,7 +290,7 @@ TEST(test_stat)
     }
 }
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
     return testo::run_all("", true);
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -9,7 +9,6 @@
 #include <kfr/io.hpp>
 
 using namespace kfr;
-using testo::assert_is_same;
 
 TEST(complex_vector)
 {
@@ -67,7 +66,7 @@ TEST(complex_math)
     CHECK(a - b == make_vector(c32{ -2, -2 }));
     CHECK(a * b == make_vector(c32{ -5, 10 }));
     CHECK(a * 2 == make_vector(c32{ 2, 4 }));
-    CHECK(a / b == make_vector(c32{ 0.44, 0.08 }));
+    CHECK(a / b == make_vector(c32{ 0.44f, 0.08f }));
     CHECK(-a == make_vector(c32{ -1, -2 }));
 
     CHECK(real(a) == make_vector(1.f));
@@ -86,26 +85,26 @@ TEST(complex_math)
     testo::epsilon<f32>() *= 5;
     testo::epsilon<f64>() *= 5;
 
-    CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773, 0.634963914784736 });
-    CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489, -0.9888977057628651 });
-    CHECK(csinh(c32{ 1.f, 1.f }) == c32{ 0.634963914784736, 1.2984575814159773 });
-    CHECK(ccosh(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489, 0.9888977057628651 });
+    CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773f, 0.634963914784736f });
+    CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, -0.9888977057628651f });
+    CHECK(csinh(c32{ 1.f, 1.f }) == c32{ 0.634963914784736f, 1.2984575814159773f });
+    CHECK(ccosh(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, 0.9888977057628651f });
 
-    CHECK(clog(c32{ 1.f, 1.f }) == c32{ 0.34657359027997264, 0.7853981633974483 });
-    CHECK(clog2(c32{ 1.f, 1.f }) == c32{ 0.5, 1.1330900354567983 });
-    CHECK(clog10(c32{ 1.f, 1.f }) == c32{ 0.15051499783199057, 0.3410940884604603 });
+    CHECK(clog(c32{ 1.f, 1.f }) == c32{ 0.34657359027997264f, 0.7853981633974483f });
+    CHECK(clog2(c32{ 1.f, 1.f }) == c32{ 0.5f, 1.1330900354567983f });
+    CHECK(clog10(c32{ 1.f, 1.f }) == c32{ 0.15051499783199057f, 0.3410940884604603f });
 
-    CHECK(cexp(c32{ 1.f, 1.f }) == c32{ 1.4686939399158849, 2.2873552871788423 });
-    CHECK(cexp2(c32{ 1.f, 1.f }) == c32{ 1.5384778027279442, 1.2779225526272695 });
-    CHECK(cexp10(c32{ 1.f, 1.f }) == c32{ -6.682015101903131, 7.439803369574931 });
+    CHECK(cexp(c32{ 1.f, 1.f }) == c32{ 1.4686939399158849f, 2.2873552871788423f });
+    CHECK(cexp2(c32{ 1.f, 1.f }) == c32{ 1.5384778027279442f, 1.2779225526272695f });
+    CHECK(cexp10(c32{ 1.f, 1.f }) == c32{ -6.682015101903131f, 7.439803369574931f });
 
 #ifdef KFR_NATIVE_F64
-    CHECK(csin(c64{ 1.f, 1.f }) == c64{ 1.2984575814159773, 0.634963914784736 });
-    CHECK(ccos(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, -0.9888977057628651 });
-    CHECK(csinh(c64{ 1.f, 1.f }) == c64{ 0.634963914784736, 1.2984575814159773 });
-    CHECK(ccosh(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, 0.9888977057628651 });
-    CHECK(clog(c64{ 1.f, 1.f }) == c64{ 0.34657359027997264, 0.7853981633974483 });
-    CHECK(cexp(c64{ 1.f, 1.f }) == c64{ 1.4686939399158849, 2.2873552871788423 });
+    CHECK(csin(c64{ 1.0, 1.0 }) == c64{ 1.2984575814159773, 0.634963914784736 });
+    CHECK(ccos(c64{ 1.0, 1.0 }) == c64{ 0.8337300251311489, -0.9888977057628651 });
+    CHECK(csinh(c64{ 1.0, 1.0 }) == c64{ 0.634963914784736, 1.2984575814159773 });
+    CHECK(ccosh(c64{ 1.0, 1.0 }) == c64{ 0.8337300251311489, 0.9888977057628651 });
+    CHECK(clog(c64{ 1.0, 1.0 }) == c64{ 0.34657359027997264, 0.7853981633974483 });
+    CHECK(cexp(c64{ 1.0, 1.0 }) == c64{ 1.4686939399158849, 2.2873552871788423 });
 #endif
 }
 
@@ -161,19 +160,18 @@ TEST(complex_function_expressions)
     CHECK(uv3[1] == 2.f);
     CHECK(uv3[2] == 8.f);
     CHECK(uv3[3] == 18.f);
-
-    assert_is_same<c32, value_type_of<decltype(uv2)>>();
-    assert_is_same<f32, value_type_of<decltype(uv3)>>();
-    assert_is_same<f32, value_type_of<decltype(real(uv2))>>();
+    testo::assert_is_same<c32, value_type_of<decltype(uv2)>>();
+    testo::assert_is_same<f32, value_type_of<decltype(uv3)>>();
+    testo::assert_is_same<f32, value_type_of<decltype(real(uv2))>>();
 }
 
 TEST(static_tests)
 {
 #ifdef CMT_ARCH_SSE2
-    static_assert(vector_width<f32, cpu_t::sse2> == 4, "");
-    static_assert(vector_width<c32, cpu_t::sse2> == 2, "");
-    static_assert(vector_width<i32, cpu_t::sse2> == 4, "");
-    static_assert(vector_width<complex<i32>, cpu_t::sse2> == 2, "");
+    static_assert(platform<f32, cpu_t::sse2>::vector_width == 4, "");
+    static_assert(platform<c32, cpu_t::sse2>::vector_width == 2, "");
+    static_assert(platform<i32, cpu_t::sse2>::vector_width == 4, "");
+    static_assert(platform<complex<i32>, cpu_t::sse2>::vector_width == 2, "");
 #endif
 
     static_assert(is_numeric<vec<complex<float>, 4>>::value, "");
@@ -184,26 +182,26 @@ TEST(static_tests)
     static_assert(vec<c32, 4>::size() == 4, "");
     static_assert(vec<f32, 4>::scalar_size() == 4, "");
     static_assert(vec<c32, 4>::scalar_size() == 8, "");
-    assert_is_same<subtype<complex<i32>>, i32>();
-    assert_is_same<vec<c32, 4>::value_type, c32>();
-    assert_is_same<vec<c32, 4>::scalar_type, f32>();
-    assert_is_same<vec<f32, 4>::value_type, f32>();
-    assert_is_same<vec<f32, 4>::scalar_type, f32>();
-    assert_is_same<vec<c32, 1>, decltype(make_vector(c32{ 0, 0 }))>();
-    assert_is_same<vec<c32, 2>, decltype(make_vector(c32{ 0, 0 }, 4))>();
-    assert_is_same<ftype<complex<i32>>, complex<f32>>();
-    assert_is_same<ftype<complex<i64>>, complex<f64>>();
-    assert_is_same<ftype<vec<complex<i32>, 4>>, vec<complex<f32>, 4>>();
-    assert_is_same<ftype<vec<complex<i64>, 8>>, vec<complex<f64>, 8>>();
-
-    assert_is_same<kfr::internal::arg<int>, kfr::internal::expression_scalar<int, 1>>();
-    assert_is_same<kfr::internal::arg<complex<int>>,
-                   kfr::internal::expression_scalar<kfr::complex<int>, 1>>();
-
-    assert_is_same<common_type<complex<int>, double>, complex<double>>();
+    testo::assert_is_same<subtype<complex<i32>>, i32>();
+    testo::assert_is_same<vec<c32, 4>::value_type, c32>();
+    testo::assert_is_same<vec<c32, 4>::scalar_type, f32>();
+    testo::assert_is_same<vec<f32, 4>::value_type, f32>();
+    testo::assert_is_same<vec<f32, 4>::scalar_type, f32>();
+    testo::assert_is_same<vec<c32, 1>, decltype(make_vector(c32{ 0, 0 }))>();
+    testo::assert_is_same<vec<c32, 2>, decltype(make_vector(c32{ 0, 0 }, 4))>();
+    testo::assert_is_same<ftype<complex<i32>>, complex<f32>>();
+    testo::assert_is_same<ftype<complex<i64>>, complex<f64>>();
+    testo::assert_is_same<ftype<vec<complex<i32>, 4>>, vec<complex<f32>, 4>>();
+    testo::assert_is_same<ftype<vec<complex<i64>, 8>>, vec<complex<f64>, 8>>();
+
+    testo::assert_is_same<kfr::internal::arg<int>, kfr::internal::expression_scalar<int, 1>>();
+    testo::assert_is_same<kfr::internal::arg<complex<int>>,
+                          kfr::internal::expression_scalar<kfr::complex<int>, 1>>();
+
+    testo::assert_is_same<common_type<complex<int>, double>, complex<double>>();
 }
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -92,7 +92,7 @@ TEST(fft_accuracy)
                   });
 }
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp
@@ -6,7 +6,6 @@
 
 #include "testo/testo.hpp"
 #include <kfr/base.hpp>
-#include <kfr/dft.hpp>
 #include <kfr/dsp.hpp>
 #include <kfr/io.hpp>
 
@@ -21,7 +20,7 @@ TEST(delay)
     CHECK(v2[2] == 101);
     CHECK(v2[19] == 118);
 
-    const univector<float, 33> v3 = delay(v1, csize<3>);
+    const univector<float, 33> v3 = delay(v1, csize_t<3>());
     CHECK(v3[0] == 0);
     CHECK(v3[1] == 0);
     CHECK(v3[2] == 0);
@@ -34,16 +33,16 @@ TEST(fracdelay)
 {
     univector<double, 5> a({ 1, 2, 3, 4, 5 });
     univector<double, 5> b = fracdelay(a, 0.5);
-    CHECK(rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < c_epsilon<double> * 5);
+    CHECK(rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < constants<double>::epsilon * 5);
 
     b = fracdelay(a, 0.1);
-    CHECK(rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < c_epsilon<double> * 5);
+    CHECK(rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < constants<double>::epsilon * 5);
 
     b = fracdelay(a, 0.0);
-    CHECK(rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < c_epsilon<double> * 5);
+    CHECK(rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < constants<double>::epsilon * 5);
 
     b = fracdelay(a, 1.0);
-    CHECK(rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < c_epsilon<double> * 5);
+    CHECK(rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < constants<double>::epsilon * 5);
 }
 
 TEST(mixdown)
@@ -80,7 +79,7 @@ TEST(phasor)
     CHECK(rms(v1 - v2) < 1.e-5);
 }
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
     return testo::run_all("", true);
diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp
@@ -2,4 +2,4 @@
 
 using namespace kfr;
 
-int main(int argc, char** argv) { return 0; }
+int main() {}
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -166,7 +166,7 @@ TEST(partition)
     }
 }
 
-int main(int argc, char** argv)
+int main()
 {
     println(library_version());
 
diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp
@@ -70,7 +70,7 @@ inline T ref_abs(T x)
 template <typename T>
 bool builtin_add_overflow(T x, T y, T* r)
 {
-#if __has_builtin(__builtin_add_overflow)
+#if CMT_HAS_BUILTIN(__builtin_add_overflow) || defined CMT_COMPILER_GCC
     return __builtin_add_overflow(x, y, r);
 #else
     *r = x + y;
@@ -80,17 +80,27 @@ bool builtin_add_overflow(T x, T y, T* r)
 template <>
 bool builtin_add_overflow<u64>(u64 x, u64 y, u64* r)
 {
+#if CMT_HAS_BUILTIN(__builtin_uaddll_overflow) || defined CMT_COMPILER_GCC
     return __builtin_uaddll_overflow(x, y, reinterpret_cast<unsigned long long*>(r));
+#else
+    *r = x + y;
+    return x > 0xFFFFFFFFFFFFFFFFull - y;
+#endif
 }
 template <>
 bool builtin_add_overflow<i64>(i64 x, i64 y, i64* r)
 {
+#if CMT_HAS_BUILTIN(__builtin_saddll_overflow) || defined CMT_COMPILER_GCC
     return __builtin_saddll_overflow(x, y, reinterpret_cast<long long*>(r));
+#else
+    *r = x + y;
+    return !((x ^ y) & 0x8000000000000000ull) && ((*r ^ x) & 0x8000000000000000ull);
+#endif
 }
 template <typename T>
 bool builtin_sub_overflow(T x, T y, T* r)
 {
-#if __has_builtin(__builtin_sub_overflow)
+#if CMT_HAS_BUILTIN(__builtin_sub_overflow) || defined CMT_COMPILER_GCC
     return __builtin_sub_overflow(x, y, r);
 #else
     *r = x - y;
@@ -100,12 +110,22 @@ bool builtin_sub_overflow(T x, T y, T* r)
 template <>
 bool builtin_sub_overflow<u64>(u64 x, u64 y, u64* r)
 {
+#if CMT_HAS_BUILTIN(__builtin_usubll_overflow) || defined CMT_COMPILER_GCC
     return __builtin_usubll_overflow(x, y, reinterpret_cast<unsigned long long*>(r));
+#else
+    *r = x - y;
+    return x < y;
+#endif
 }
 template <>
 bool builtin_sub_overflow<i64>(i64 x, i64 y, i64* r)
 {
+#if CMT_HAS_BUILTIN(__builtin_ssubll_overflow) || defined CMT_COMPILER_GCC
     return __builtin_ssubll_overflow(x, y, reinterpret_cast<long long*>(r));
+#else
+    *r = x - y;
+    return ((x ^ y) & 0x8000000000000000ull) && ((*r ^ x) & 0x8000000000000000ull);
+#endif
 }
 //#endif
 template <typename T>
@@ -152,7 +172,7 @@ TEST(intrin_abs)
                       using T    = type_of<decltype(type)>;
                       using Tsub = subtype<T>;
                       const T x(value);
-                      CHECK(kfr::abs(x) == apply([](auto x) { return Tsub(std::abs(x)); }, x));
+                      CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x));
                   });
 }
 
@@ -389,4 +409,4 @@ TEST(intrin_math)
     CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233)));
 }
 
-int main(int argc, char** argv) { return testo::run_all("", false); }
+int main() { return testo::run_all("", false); }
diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp
@@ -54,4 +54,4 @@ TEST(test_fir_avx)
     }
 }
 
-int main(int argc, char** argv) { return testo::run_all("", true); }
+int main() { return testo::run_all("", true); }
diff --git a/tests/testo/testo.hpp b/tests/testo/testo.hpp
@@ -99,7 +99,7 @@ struct comparison
 
     comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)) {}
 
-    bool operator()() { return cmp(left, right); }
+    bool operator()() const { return cmp(left, right); }
 };
 
 template <typename Left, typename Right>
@@ -120,8 +120,8 @@ struct equality_comparer
     bool operator()(const L& l, const R& r) const { return l == r; }
 };
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wfloat-equal"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal")
 
 template <typename T>
 inline T& epsilon()
@@ -149,7 +149,7 @@ struct equality_comparer<long double, long double>
     }
 };
 
-#pragma clang diagnostic pop
+CMT_PRAGMA_GNU(GCC diagnostic pop)
 
 template <typename L, typename R>
 struct equality_comparer<L, R, void_t<enable_if<!compound_type_traits<L>::is_scalar>>>
@@ -179,7 +179,7 @@ struct cmp_eq
     static const char* op() { return "=="; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         equality_comparer<decay<L>, decay<R>> eq;
         return eq(left, right);
@@ -191,7 +191,7 @@ struct cmp_ne
     static const char* op() { return "!="; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         return !cmp_eq()(left, right);
     }
@@ -202,7 +202,7 @@ struct cmp_lt
     static const char* op() { return "<"; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         return left < right;
     }
@@ -213,7 +213,7 @@ struct cmp_gt
     static const char* op() { return ">"; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         return left > right;
     }
@@ -224,7 +224,7 @@ struct cmp_le
     static const char* op() { return "<="; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         return left <= right;
     }
@@ -235,7 +235,7 @@ struct cmp_ge
     static const char* op() { return ">="; }
 
     template <typename L, typename R>
-    bool operator()(L&& left, R&& right)
+    bool operator()(L&& left, R&& right) const
     {
         return left >= right;
     }
@@ -394,14 +394,14 @@ struct test_case
     }
 
     template <typename Op, typename L, typename R>
-    void check(comparison<Op, L, R> comparison, const char* expr)
+    void check(const comparison<Op, L, R>& comparison, const char* expr)
     {
         bool result = comparison();
         check(result, as_string(comparison.left, " ", Op::op(), " ", comparison.right), expr);
     }
 
     template <typename L>
-    void check(half_comparison<L> comparison, const char* expr)
+    void check(const half_comparison<L>& comparison, const char* expr)
     {
         bool result = comparison.left ? true : false;
         check(result, as_string(comparison.left), expr);
diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp
@@ -26,7 +26,8 @@ double ulps(T test, const mpfr::number& ref)
 
 TEST(test_sin_cos)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, +c_pi<f64, 2>, 0.05),
+    testo::matrix(named("type")  = ctypes_t<f32, f64>(),
+                  named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -37,7 +38,7 @@ TEST(test_sin_cos)
 
 TEST(test_log)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -47,7 +48,7 @@ TEST(test_log)
 
 TEST(test_log2)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -57,7 +58,7 @@ TEST(test_log2)
 
 TEST(test_log10)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -67,7 +68,7 @@ TEST(test_log10)
 
 TEST(test_exp)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -77,7 +78,7 @@ TEST(test_exp)
 
 TEST(test_exp2)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -87,7 +88,7 @@ TEST(test_exp2)
 
 TEST(test_exp10)
 {
-    testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05),
+    testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
                   [](auto type, double value) {
                       using T = type_of<decltype(type)>;
                       const T x(value);
@@ -95,7 +96,7 @@ TEST(test_exp10)
                   });
 }
 
-int main(int argc, char** argv)
+int main()
 {
     mpfr::scoped_precision p(128);
     return testo::run_all("");

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

M	CMakeLists.txt	\|	3	+++
M	examples/biquads.cpp	\|	2	+-
M	examples/dft.cpp	\|	4	+---
M	examples/fir.cpp	\|	2	+-
M	examples/sample_rate_conversion.cpp	\|	2	+-
M	examples/window.cpp	\|	2	+-
M	include/kfr/base/abs.hpp	\|	8	++++----
M	include/kfr/base/atan.hpp	\|	6	++++--
M	include/kfr/base/basic_expressions.hpp	\|	22	+++++++++++-----------
M	include/kfr/base/complex.hpp	\|	40	+++++++++++++++++++++++++++++++++++-----
M	include/kfr/base/constants.hpp	\|	258	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M	include/kfr/base/cpuid.hpp	\|	18	+++++++++++++++++-
M	include/kfr/base/digitreverse.hpp	\|	16	++++++++++++----
M	include/kfr/base/expression.hpp	\|	11	++++++-----
M	include/kfr/base/function.hpp	\|	59	++++++++++++++++++++++++++++-------------------------------
M	include/kfr/base/gamma.hpp	\|	6	+++---
M	include/kfr/base/generators.hpp	\|	14	+++++++-------
M	include/kfr/base/kfr.h	\|	15	+++++++++------
M	include/kfr/base/log_exp.hpp	\|	26	+++++++++++++-------------
M	include/kfr/base/logical.hpp	\|	16	++++++++--------
M	include/kfr/base/memory.hpp	\|	11	++++++++---
M	include/kfr/base/modzerobessel.hpp	\|	89	+++++++++++++++++++++++++++++++++++++++----------------------------------------
M	include/kfr/base/operators.hpp	\|	16	++++++++--------
M	include/kfr/base/platform.hpp	\|	113	+++++++++++++++++++++++++++++++++++--------------------------------------------
M	include/kfr/base/pointer.hpp	\|	7	++++---
M	include/kfr/base/random.hpp	\|	18	++++++++++++------
M	include/kfr/base/read_write.hpp	\|	74	+++++++++++++++++++++++++++++++++++++-------------------------------------
M	include/kfr/base/reduce.hpp	\|	50	+++++++++++++++++++++++++-------------------------
M	include/kfr/base/select.hpp	\|	8	++++----
M	include/kfr/base/shuffle.hpp	\|	30	+++++++++++++-----------------
M	include/kfr/base/simd.hpp	\|	87	++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M	include/kfr/base/sin_cos.hpp	\|	92	+++++++++++++++++++++++++++++++++++--------------------------------------------
M	include/kfr/base/sort.hpp	\|	4	++--
M	include/kfr/base/tan.hpp	\|	68	++++++++++++++++++++++++++++++++++----------------------------------
M	include/kfr/base/types.hpp	\|	62	+++++++++++++-------------------------------------------------
M	include/kfr/base/univector.hpp	\|	14	++++++++++----
M	include/kfr/base/vec.hpp	\|	210	++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
M	include/kfr/cident.h	\|	50	++++++++++++++++++++++++++++++++++++++++++++++++--
M	include/kfr/cometa.hpp	\|	262	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M	include/kfr/cometa/array.hpp	\|	5	+++++
M	include/kfr/cometa/cstring.hpp	\|	14	+++++++-------
M	include/kfr/cometa/ctti.hpp	\|	6	+++---
M	include/kfr/cometa/function.hpp	\|	2	+-
M	include/kfr/cometa/named_arg.hpp	\|	5	+++++
M	include/kfr/cometa/string.hpp	\|	71	+++++++++++++++++++++++++++++++++++++++++------------------------------
M	include/kfr/data/sincos.hpp	\|	220	+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M	include/kfr/dft/bitrev.hpp	\|	24	++++++++++++------------
M	include/kfr/dft/cache.hpp	\|	17	++++++++++-------
M	include/kfr/dft/conv.hpp	\|	10	+++++-----
M	include/kfr/dft/fft.hpp	\|	310	++++++++++++++++++++++++++++++++++++++++---------------------------------------
M	include/kfr/dft/ft.hpp	\|	371	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	include/kfr/dsp/biquad.hpp	\|	6	+++---
M	include/kfr/dsp/delay.hpp	\|	2	+-
M	include/kfr/dsp/fir.hpp	\|	2	+-
M	include/kfr/dsp/mixdown.hpp	\|	14	+++++++++++---
M	include/kfr/dsp/oscillators.hpp	\|	43	++++++++++++++++++++++---------------------
M	include/kfr/dsp/sample_rate_conversion.hpp	\|	4	++--
M	include/kfr/dsp/units.hpp	\|	20	++++++++++----------
M	include/kfr/dsp/window.hpp	\|	8	++++----
M	include/kfr/io/audiofile.hpp	\|	71	++++++++++++++++++++++++++++++++++++++++-------------------------------
M	include/kfr/io/python_plot.hpp	\|	6	+++---
M	include/kfr/io/tostring.hpp	\|	4	++++
M	tests/CMakeLists.txt	\|	21	+++++++++++++--------
M	tests/base_test.cpp	\|	22	+++++++++++++++-------
M	tests/complex_test.cpp	\|	86	+++++++++++++++++++++++++++++++++++++++----------------------------------------
M	tests/dft_test.cpp	\|	2	+-
M	tests/dsp_test.cpp	\|	13	++++++-------
M	tests/empty_test.cpp	\|	2	+-
M	tests/expression_test.cpp	\|	2	+-
M	tests/intrinsic_test.cpp	\|	28	++++++++++++++++++++++++----
M	tests/multiarch.cpp	\|	2	+-
M	tests/testo/testo.hpp	\|	24	++++++++++++------------
M	tests/transcendental_test.cpp	\|	17	+++++++++--------