kfr

Fast, modern C++ DSP framework: FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit b063114b7bb195a28a3d6e40a9ba203594891523
parent b849acc08979dc137dd03dddc285fc51da65decd
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue,  9 Aug 2016 08:58:46 +0300

Merge remote-tracking branch 'origin/dev'

Diffstat:
M.travis.yml | 13+++----------
MCMakeLists.txt | 20++++++++++++++++----
Mexamples/CMakeLists.txt | 6+++---
Mexamples/biquads.cpp | 34+++++++++++++++++-----------------
Mexamples/dft.cpp | 9++++-----
Mexamples/fir.cpp | 12++++++------
Dexamples/resampling.cpp | 105-------------------------------------------------------------------------------
Aexamples/sample_rate_conversion.cpp | 105+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mexamples/window.cpp | 2+-
Minclude/kfr/all.hpp | 74++++----------------------------------------------------------------------
Ainclude/kfr/base.hpp | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/base/abs.hpp | 39++++++++++++++++++++++++++++++++++++---
Minclude/kfr/base/asin_acos.hpp | 4++--
Minclude/kfr/base/atan.hpp | 12++++++------
Minclude/kfr/base/basic_expressions.hpp | 53+++++++++++++++++++++++++++--------------------------
Minclude/kfr/base/complex.hpp | 98+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Minclude/kfr/base/conversion.hpp | 6+++---
Minclude/kfr/base/cpuid.hpp | 30++++++++++++++----------------
Minclude/kfr/base/cpuid_auto.hpp | 8++++----
Minclude/kfr/base/digitreverse.hpp | 6+++---
Minclude/kfr/base/expression.hpp | 63+++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/base/function.hpp | 6+++---
Minclude/kfr/base/gamma.hpp | 4++--
Minclude/kfr/base/generators.hpp | 40++++++++++++++++++++--------------------
Minclude/kfr/base/intrinsics.h | 6+++---
Minclude/kfr/base/kfr.h | 103++-----------------------------------------------------------------------------
Minclude/kfr/base/log_exp.hpp | 2+-
Minclude/kfr/base/logical.hpp | 63++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Minclude/kfr/base/memory.hpp | 22+++++++++++-----------
Minclude/kfr/base/min_max.hpp | 39+++++++++++++++++++++++++++++++++++----
Minclude/kfr/base/modzerobessel.hpp | 6+++---
Minclude/kfr/base/operators.hpp | 206+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/base/pointer.hpp | 42++++++++++++++++++++++--------------------
Minclude/kfr/base/random.hpp | 8++++----
Minclude/kfr/base/read_write.hpp | 34+++++++++++++++++-----------------
Minclude/kfr/base/reduce.hpp | 18+++++++++---------
Minclude/kfr/base/round.hpp | 24++++++++++++++++++++++--
Minclude/kfr/base/saturation.hpp | 31+++++++++++++++++++++++++++++--
Minclude/kfr/base/select.hpp | 70+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Minclude/kfr/base/shuffle.hpp | 127++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Minclude/kfr/base/sin_cos.hpp | 2+-
Minclude/kfr/base/sort.hpp | 4++--
Minclude/kfr/base/sqrt.hpp | 4++--
Minclude/kfr/base/types.hpp | 94++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Minclude/kfr/base/univector.hpp | 34+++++++++++++++++-----------------
Minclude/kfr/base/vec.hpp | 563+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/cident.h | 532+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/cometa.hpp | 164+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/cometa/string.hpp | 120++++++++++++++++++++++++++++++++++++++++----------------------------------------
Ainclude/kfr/dft.hpp | 31+++++++++++++++++++++++++++++++
Minclude/kfr/dft/bitrev.hpp | 6+++---
Minclude/kfr/dft/conv.hpp | 2+-
Minclude/kfr/dft/fft.hpp | 54+++++++++++++++++++++++++++---------------------------
Minclude/kfr/dft/ft.hpp | 124+++++++++++++++++++++++++++++++++++++++----------------------------------------
Ainclude/kfr/dsp.hpp | 43+++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dsp/biquad.hpp | 144++++++++++++++++++++++++++++++++++++++++---------------------------------------
Minclude/kfr/dsp/biquad_design.hpp | 32++++++++++++++++----------------
Ainclude/kfr/dsp/dcremove.hpp | 37+++++++++++++++++++++++++++++++++++++
Minclude/kfr/dsp/fir.hpp | 18+++++++++++-------
Minclude/kfr/dsp/fir_design.hpp | 8++++----
Minclude/kfr/dsp/fracdelay.hpp | 4+---
Minclude/kfr/dsp/goertzel.hpp | 147+++++++++++++++++++++++++++++++++++++------------------------------------------
Minclude/kfr/dsp/interpolation.hpp | 4++--
Ainclude/kfr/dsp/mixdown.hpp | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dsp/resample.hpp | 188+------------------------------------------------------------------------------
Ainclude/kfr/dsp/sample_rate_conversion.hpp | 227+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dsp/units.hpp | 2+-
Minclude/kfr/dsp/waveshaper.hpp | 7+++----
Minclude/kfr/dsp/window.hpp | 91++++++++++++++++++++++++++++++++++++++++---------------------------------------
Ainclude/kfr/io.hpp | 30++++++++++++++++++++++++++++++
Minclude/kfr/io/file.hpp | 4++--
Minclude/kfr/io/python_plot.hpp | 9+++++++--
Minclude/kfr/io/tostring.hpp | 6+++++-
Minclude/kfr/math.hpp | 23+----------------------
Minclude/kfr/version.hpp | 2+-
Msources.cmake | 75+++++++++++++++++++++++++++++++++------------------------------------------
Mtests/CMakeLists.txt | 71++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mtests/complex_test.cpp | 17++++++++++++++---
Mtests/conv_test.cpp | 8++++----
Mtests/dft_test.cpp | 8+++++++-
Mtests/empty_test.cpp | 2+-
Mtests/intrinsic_test.cpp | 82+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Atests/multiarch.cpp | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/multiarch_fir_avx.cpp | 18++++++++++++++++++
Atests/multiarch_fir_sse2.cpp | 18++++++++++++++++++
Mtests/testo/testo.hpp | 22+++++++++++-----------
Mtests/vec_test.cpp | 79++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Aupdate-sources.py | 31+++++++++++++++++++++++++++++++
88 files changed, 2861 insertions(+), 2061 deletions(-)
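
The main user-facing changes in this merge, as reflected in the diffstat: the monolithic include/kfr/all.hpp is reduced to a thin wrapper over four new umbrella headers (base.hpp, dft.hpp, dsp.hpp, io.hpp), the examples switch from hard-coded double to the library's fbase type, the resampling example is renamed to sample_rate_conversion (with resampler and swept now explicitly instantiated as resampler<fbase> and swept<fbase>), and the KFR_INLINE/CID_* macro spellings are replaced by their CMT_* counterparts. Below is a minimal sketch of what example code looks like after this commit, condensed from the examples/dft.cpp hunks in the diff that follows; it is not a verbatim file from the repository, the all-in-one header is used for brevity, and the dft.execute call is assumed unchanged from the pre-merge example.

// Sketch of the post-merge example style (condensed from examples/dft.cpp below).
// kfr/all.hpp now simply forwards to the new umbrella headers
// base.hpp, dft.hpp, dsp.hpp and io.hpp added in this commit.
#include <kfr/all.hpp>

using namespace kfr;

int main()
{
    const size_t size = 128;

    // buffers now use the fbase typedef instead of an explicit double
    univector<complex<fbase>, size> in  = sin(linspace(0.0, c_pi<fbase, 2> * 4.0, size));
    univector<complex<fbase>, size> out = scalar(qnan);

    // the DFT plan is templated on fbase as well
    const dft_plan<fbase> dft(size);
    univector<u8> temp(dft.temp_size); // work buffer for the plan (if needed)
    dft.execute(out, in, temp);        // assumed unchanged from the pre-merge example

    // normalize, then get magnitude in decibels
    out = out / size;
    univector<fbase, size> dB = amp_to_dB(cabs(out));
    println("max = ", maxof(dB));
    println("min = ", minof(dB));
    return 0;
}
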

diff --git a/.travis.yml b/.travis.yml @@ -4,17 +4,13 @@ matrix: include: - os: linux compiler: clang - sudo: required - dist: precise + sudo: false addons: apt: sources: - ubuntu-toolchain-r-test - llvm-toolchain-precise-3.8 - - george-edison55-precise-backports packages: - - cmake - - cmake-data - g++-5 - clang-3.8 - libmpfr-dev @@ -25,14 +21,11 @@ matrix: - os: osx osx_image: xcode7.3 - os: osx - osx_image: xcode7.2 - - os: osx - osx_image: xcode7.1 - - os: osx osx_image: xcode7 + - os: osx + osx_image: beta-xcode6.3 before_install: - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get update -qq ; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update ; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew reinstall cmake ; fi - cmake --version diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -1,21 +1,27 @@ # Copyright (C) 2016 D Levin (http://www.kfrlib.com) # This file is part of KFR -# +# # KFR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # KFR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with KFR. -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 2.8) + +if (CMAKE_VERSION VERSION_LESS "2.8.12") + function(add_compile_options) + add_definitions(${ARGN}) + endfunction(add_compile_options) +endif () set(OPT_BITNESS "") # cmake -DOPT_BITNESS="-m32" or -m64 set(OPT_STATIC "") # cmake -DOPT_STATIC="-static" @@ -46,6 +52,12 @@ include(sources.cmake) set(ALL_WARNINGS -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-c99-extensions -Wno-padded) +if (IOS) + set(STD_LIB) +else () + set(STD_LIB stdc++) +endif () + if (NOT MSVC) add_compile_options(-std=c++1y) else () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt @@ -15,7 +15,7 @@ # along with KFR. 
-cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 2.8) if (NOT MSVC) add_compile_options(-fno-exceptions -fno-rtti) @@ -24,7 +24,7 @@ if (NOT MSVC) else () set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_FLAGS}") endif () - link_libraries(stdc++ pthread) + link_libraries(${STD_LIB} pthread) else () add_compile_options(/arch:AVX) endif () @@ -34,5 +34,5 @@ include_directories(../include) add_executable(biquads biquads.cpp ${KFR_SRC}) add_executable(window window.cpp ${KFR_SRC}) add_executable(fir fir.cpp ${KFR_SRC}) -add_executable(resampling resampling.cpp ${KFR_SRC}) +add_executable(sample_rate_conversion sample_rate_conversion.cpp ${KFR_SRC}) add_executable(dft dft.cpp ${KFR_SRC} ${DFT_SRC}) diff --git a/examples/biquads.cpp b/examples/biquads.cpp @@ -32,53 +32,53 @@ int main(int argc, char** argv) const std::string options = "phaseresp=True"; - univector<double, 128> output; + univector<fbase, 128> output; { - biquad_params<double> bq[] = { biquad_notch(0.1, 0.5), biquad_notch(0.2, 0.5), biquad_notch(0.3, 0.5), - biquad_notch(0.4, 0.5) }; + biquad_params<fbase> bq[] = { biquad_notch(0.1, 0.5), biquad_notch(0.2, 0.5), biquad_notch(0.3, 0.5), + biquad_notch(0.4, 0.5) }; output = biquad(bq, simpleimpulse()); } plot_save("biquad_notch", output, options + ", title='Four Biquad Notch filters'"); { - biquad_params<double> bq[] = { biquad_lowpass(0.2, 0.9) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_lowpass(0.2, 0.9) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_lowpass", output, options + ", title='Biquad Low pass filter (0.2, 0.9)'"); { - biquad_params<double> bq[] = { biquad_highpass(0.3, 0.1) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_highpass(0.3, 0.1) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_highpass", output, options + ", title='Biquad High pass filter (0.3, 0.1)'"); { - biquad_params<double> bq[] = { biquad_peak(0.3, 0.5, +9.0) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_peak(0.3, 0.5, +9.0) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_peak", output, options + ", title='Biquad Peak filter (0.2, 0.5, +9)'"); { - biquad_params<double> bq[] = { biquad_peak(0.3, 3.0, -2.0) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_peak(0.3, 3.0, -2.0) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_peak2", output, options + ", title='Biquad Peak filter (0.3, 3, -2)'"); { - biquad_params<double> bq[] = { biquad_lowshelf(0.3, -1.0) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_lowshelf(0.3, -1.0) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_lowshelf", output, options + ", title='Biquad low shelf filter (0.3, -1)'"); { - biquad_params<double> bq[] = { biquad_highshelf(0.3, +9.0) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_highshelf(0.3, +9.0) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_highshelf", output, options + ", title='Biquad high shelf filter (0.3, +9)'"); { - biquad_params<double> bq[] = { biquad_bandpass(0.25, 0.2) }; - output = biquad(bq, simpleimpulse()); + biquad_params<fbase> bq[] = { biquad_bandpass(0.25, 0.2) }; + output = biquad(bq, simpleimpulse()); } plot_save("biquad_bandpass", output, options + ", title='Biquad band pass (0.25, 0.2)'"); diff --git a/examples/dft.cpp b/examples/dft.cpp @@ -29,14 +29,13 @@ int main(int argc, char** argv) // 
fft size const size_t size = 128; - using float_type = double; // initialize input & output buffers - univector<complex<float_type>, size> in = sin(linspace(0.0, c_pi<float_type, 2> * 4.0, size)); - univector<complex<float_type>, size> out = scalar(qnan); + univector<complex<fbase>, size> in = sin(linspace(0.0, c_pi<fbase, 2> * 4.0, size)); + univector<complex<fbase>, size> out = scalar(qnan); // initialize fft - const dft_plan<float_type> dft(size); + const dft_plan<fbase> dft(size); // allocate work buffer for fft (if needed) univector<u8> temp(dft.temp_size); @@ -48,7 +47,7 @@ int main(int argc, char** argv) out = out / size; // get magnitude and convert to decibels - univector<float_type, size> dB = amp_to_dB(cabs(out)); + univector<fbase, size> dB = amp_to_dB(cabs(out)); println("max = ", maxof(dB)); println("min = ", minof(dB)); diff --git a/examples/fir.cpp b/examples/fir.cpp @@ -37,15 +37,15 @@ int main(int argc, char** argv) const std::string options = "phaseresp=False"; - univector<double, 15> taps15; - univector<double, 127> taps127; - univector<double, 8191> taps8191; + univector<fbase, 15> taps15; + univector<fbase, 127> taps127; + univector<fbase, 8191> taps8191; - expression_pointer<double> hann = to_pointer(window_hann(taps15.size())); + expression_pointer<fbase> hann = to_pointer(window_hann(taps15.size())); - expression_pointer<double> kaiser = to_pointer(window_kaiser(taps127.size(), 3.0)); + expression_pointer<fbase> kaiser = to_pointer(window_kaiser(taps127.size(), 3.0)); - expression_pointer<double> blackman_harris = to_pointer(window_blackman_harris(taps8191.size())); + expression_pointer<fbase> blackman_harris = to_pointer(window_blackman_harris(taps8191.size())); fir_lowpass(taps15, 0.15, hann, true); plot_save("fir_lowpass_hann", taps15, options + ", title='15-point lowpass FIR, Hann window'"); diff --git a/examples/resampling.cpp b/examples/resampling.cpp @@ -1,105 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -// library_version() -#include <kfr/version.hpp> - -// print(), format() -#include <kfr/cometa/string.hpp> - -#include <kfr/math.hpp> - -// resample* -#include <kfr/dsp/resample.hpp> - -// file* -#include <kfr/io/audiofile.hpp> - -// swept -#include <kfr/dsp/oscillators.hpp> - -// plot_save() -#include <kfr/io/python_plot.hpp> - -#include <iostream> - -using namespace kfr; - -constexpr size_t input_sr = 96000; -constexpr size_t output_sr = 44100; -constexpr size_t len = 96000 * 6; -constexpr f64 i32max = 2147483647.0; - -int main(int argc, char** argv) -{ - println(library_version()); - - const std::string options = "phaseresp=False"; - - univector<f64> swept_sine = swept(0.5, len); - - { - auto r = resampler(resample_quality::high, output_sr, input_sr, 1.0, 0.496); - univector<f64> resampled(len * output_sr / input_sr); - - const size_t destsize = r(resampled.data(), swept_sine); - - univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); - univector2d<i32> data = { i32data }; - - auto wr = sequential_file_writer("audio_high_quality.wav"); - audio_encode(wr, data, audioformat(data, output_sr)); - - plot_save("audio_high_quality", "audio_high_quality.wav", ""); - } - - { - auto r = resampler(resample_quality::normal, output_sr, input_sr, 1.0, 0.496); - univector<f64> resampled(len * output_sr / input_sr); - - const size_t destsize = r(resampled.data(), swept_sine); - - univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); - 
univector2d<i32> data = { i32data }; - - auto wr = sequential_file_writer("audio_normal_quality.wav"); - audio_encode(wr, data, audioformat(data, output_sr)); - - plot_save("audio_normal_quality", "audio_normal_quality.wav", ""); - } - - { - auto r = resampler(resample_quality::low, output_sr, input_sr, 1.0, 0.496); - univector<f64> resampled(len * output_sr / input_sr); - - const size_t destsize = r(resampled.data(), swept_sine); - - univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); - univector2d<i32> data = { i32data }; - - auto wr = sequential_file_writer("audio_low_quality.wav"); - audio_encode(wr, data, audioformat(data, output_sr)); - - plot_save("audio_low_quality", "audio_low_quality.wav", ""); - } - - { - auto r = resampler(resample_quality::draft, output_sr, input_sr, 1.0, 0.496); - univector<f64> resampled(len * output_sr / input_sr); - - const size_t destsize = r(resampled.data(), swept_sine); - - univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); - univector2d<i32> data = { i32data }; - - auto wr = sequential_file_writer("audio_draft_quality.wav"); - audio_encode(wr, data, audioformat(data, output_sr)); - - plot_save("audio_draft_quality", "audio_draft_quality.wav", ""); - } - - return 0; -} diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp @@ -0,0 +1,105 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +// library_version() +#include <kfr/version.hpp> + +// print(), format() +#include <kfr/cometa/string.hpp> + +#include <kfr/math.hpp> + +// resample* +#include <kfr/dsp/sample_rate_conversion.hpp> + +// file* +#include <kfr/io/audiofile.hpp> + +// swept +#include <kfr/dsp/oscillators.hpp> + +// plot_save() +#include <kfr/io/python_plot.hpp> + +#include <iostream> + +using namespace kfr; + +constexpr size_t input_sr = 96000; +constexpr size_t output_sr = 44100; +constexpr size_t len = 96000 * 6; +constexpr fbase i32max = 2147483647.0; + +int main(int argc, char** argv) +{ + println(library_version()); + + const std::string options = "phaseresp=False"; + + univector<fbase> swept_sine = swept<fbase>(0.5, len); + + { + auto r = resampler<fbase>(resample_quality::high, output_sr, input_sr, 1.0, 0.496); + univector<fbase> resampled(len * output_sr / input_sr); + + const size_t destsize = r(resampled.data(), swept_sine); + + univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); + univector2d<i32> data = { i32data }; + + auto wr = sequential_file_writer("audio_high_quality.wav"); + audio_encode(wr, data, audioformat(data, output_sr)); + + plot_save("audio_high_quality", "audio_high_quality.wav", ""); + } + + { + auto r = resampler<fbase>(resample_quality::normal, output_sr, input_sr, 1.0, 0.496); + univector<fbase> resampled(len * output_sr / input_sr); + + const size_t destsize = r(resampled.data(), swept_sine); + + univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); + univector2d<i32> data = { i32data }; + + auto wr = sequential_file_writer("audio_normal_quality.wav"); + audio_encode(wr, data, audioformat(data, output_sr)); + + plot_save("audio_normal_quality", "audio_normal_quality.wav", ""); + } + + { + auto r = resampler<fbase>(resample_quality::low, output_sr, input_sr, 1.0, 0.496); + univector<fbase> resampled(len * output_sr / input_sr); + + const size_t destsize = r(resampled.data(), swept_sine); + + univector<i32> i32data = 
clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); + univector2d<i32> data = { i32data }; + + auto wr = sequential_file_writer("audio_low_quality.wav"); + audio_encode(wr, data, audioformat(data, output_sr)); + + plot_save("audio_low_quality", "audio_low_quality.wav", ""); + } + + { + auto r = resampler<fbase>(resample_quality::draft, output_sr, input_sr, 1.0, 0.496); + univector<fbase> resampled(len * output_sr / input_sr); + + const size_t destsize = r(resampled.data(), swept_sine); + + univector<i32> i32data = clamp(resampled.slice(0, destsize) * i32max, -i32max, +i32max); + univector2d<i32> data = { i32data }; + + auto wr = sequential_file_writer("audio_draft_quality.wav"); + audio_encode(wr, data, audioformat(data, output_sr)); + + plot_save("audio_draft_quality", "audio_draft_quality.wav", ""); + } + + return 0; +} diff --git a/examples/window.cpp b/examples/window.cpp @@ -30,7 +30,7 @@ int main(int argc, char** argv) const std::string options = "freqresp=True, dots=True, padwidth=1024, " "log_freq=False, horizontal=False, normalized_freq=True"; - univector<double, 64> output; + univector<fbase, 64> output; output = window_hann(output.size()); plot_save("window_hann", output, options + ", title='Hann window'"); diff --git a/include/kfr/all.hpp b/include/kfr/all.hpp @@ -21,73 +21,7 @@ * See http://www.kfrlib.com for details. */ -#include "cometa/string.hpp" - -#include "base/abs.hpp" -#include "base/asin_acos.hpp" -#include "base/atan.hpp" -#include "base/basic_expressions.hpp" -#include "base/clamp.hpp" -#include "base/compiletime.hpp" -#include "base/complex.hpp" -#include "base/constants.hpp" -#include "base/conversion.hpp" -#include "base/cpuid.hpp" -#include "base/cpuid_auto.hpp" -#include "base/digitreverse.hpp" -#include "base/function.hpp" -#include "base/gamma.hpp" -#include "base/generators.hpp" -#include "base/hyperbolic.hpp" -#include "base/log_exp.hpp" -#include "base/logical.hpp" -#include "base/memory.hpp" -#include "base/min_max.hpp" -#include "base/modzerobessel.hpp" -#include "base/operators.hpp" -#include "base/pointer.hpp" -#include "base/random.hpp" -#include "base/read_write.hpp" -#include "base/reduce.hpp" -#include "base/round.hpp" -#include "base/saturation.hpp" -#include "base/select.hpp" -#include "base/shuffle.hpp" -#include "base/sin_cos.hpp" -#include "base/small_buffer.hpp" -#include "base/sort.hpp" -#include "base/sqrt.hpp" -#include "base/tan.hpp" -#include "base/types.hpp" -#include "base/univector.hpp" -#include "base/vec.hpp" -#include "version.hpp" - -#include "data/bitrev.hpp" -#include "data/sincos.hpp" -#include "dsp/biquad.hpp" -#include "dsp/biquad_design.hpp" -#include "dsp/fir.hpp" -#include "dsp/fir_design.hpp" -#include "dsp/fracdelay.hpp" -#include "dsp/goertzel.hpp" -#include "dsp/impulse.hpp" -#include "dsp/interpolation.hpp" -#include "dsp/oscillators.hpp" -#include "dsp/resample.hpp" -#include "dsp/speaker.hpp" -#include "dsp/units.hpp" -#include "dsp/waveshaper.hpp" -#include "dsp/weighting.hpp" -#include "dsp/window.hpp" -#include "io/audiofile.hpp" -#include "io/file.hpp" -#include "io/python_plot.hpp" -#include "io/tostring.hpp" -#include "math.hpp" - -#include "dft/bitrev.hpp" -#include "dft/conv.hpp" -#include "dft/fft.hpp" -#include "dft/ft.hpp" -#include "dft/reference_dft.hpp" +#include "base.hpp" +#include "dft.hpp" +#include "dsp.hpp" +#include "io.hpp" diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp @@ -0,0 +1,63 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of 
KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. + */ +#pragma once + +#include "base/abs.hpp" +#include "base/asin_acos.hpp" +#include "base/atan.hpp" +#include "base/basic_expressions.hpp" +#include "base/clamp.hpp" +#include "base/compiletime.hpp" +#include "base/complex.hpp" +#include "base/constants.hpp" +#include "base/conversion.hpp" +#include "base/cpuid.hpp" +#include "base/cpuid_auto.hpp" +#include "base/digitreverse.hpp" +#include "base/expression.hpp" +#include "base/function.hpp" +#include "base/gamma.hpp" +#include "base/generators.hpp" +#include "base/hyperbolic.hpp" +#include "base/log_exp.hpp" +#include "base/logical.hpp" +#include "base/memory.hpp" +#include "base/min_max.hpp" +#include "base/modzerobessel.hpp" +#include "base/operators.hpp" +#include "base/pointer.hpp" +#include "base/random.hpp" +#include "base/read_write.hpp" +#include "base/reduce.hpp" +#include "base/round.hpp" +#include "base/saturation.hpp" +#include "base/select.hpp" +#include "base/shuffle.hpp" +#include "base/sin_cos.hpp" +#include "base/small_buffer.hpp" +#include "base/sort.hpp" +#include "base/sqrt.hpp" +#include "base/tan.hpp" +#include "base/types.hpp" +#include "base/univector.hpp" +#include "base/vec.hpp" diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -31,6 +31,9 @@ namespace kfr namespace intrinsics { + +#if defined CMT_ARCH_SSSE3 + // floating point template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) @@ -38,8 +41,6 @@ KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) return x & internal::invhighbitmask<T>; } -#if defined CID_ARCH_SSSE3 - KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); } KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); } @@ -49,7 +50,7 @@ KFR_SINTRIN u32sse abs(const u32sse& x) { return x; } KFR_SINTRIN u16sse abs(const u16sse& x) { return x; } KFR_SINTRIN u8sse abs(const u8sse& x) { return x; } -#if defined CID_ARCH_AVX2 +#if defined CMT_ARCH_AVX2 KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); } KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); } KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); } @@ -62,7 +63,39 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; } KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) +#elif defined CMT_ARCH_NEON + +KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); } +KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); } +KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); } +#if defined CMT_ARCH_NEON64 +KFR_SINTRIN i64neon abs(const 
i64neon& x) { return vabsq_s64(*x); } +#else +KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); } +#endif + +KFR_SINTRIN u8neon abs(const u8neon& x) { return x; } +KFR_SINTRIN u16neon abs(const u16neon& x) { return x; } +KFR_SINTRIN u32neon abs(const u32neon& x) { return x; } +KFR_SINTRIN u64neon abs(const u64neon& x) { return x; } + +KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); } +#if defined CMT_ARCH_NEON64 +KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); } #else +KFR_SINTRIN f64neon abs(const f64neon& x) { return x & internal::invhighbitmask<f64>; } +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) +{ + return x & internal::invhighbitmask<T>; +} // fallback template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp @@ -36,14 +36,14 @@ namespace intrinsics template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x) { - const vec<Tout, N> xx = cast<Tout>(x); + const vec<Tout, N> xx = x; return atan2(xx, sqrt(Tout(1) - xx * xx)); } template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x) { - const vec<Tout, N> xx = cast<Tout>(x); + const vec<Tout, N> xx = x; return atan2(sqrt(Tout(1) - xx * xx), xx); } KFR_I_CONVERTER(asin) diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -65,14 +65,14 @@ KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x) { vec<f64, N> s, t, u; vec<i64, N> q; - q = select(x < 0, -2ll, 0ll); + q = select(x < 0, i64(-2), i64(0)); x = select(x < 0, -x, x); - vec<i64, N> m; + mask<i64, N> m; m = y > x; t = x; x = select(m, y, x); y = select(m, -t, y); - q = select(m, q + 1ll, q); + q = select(m, q + i64(1), q); s = y / x; t = s * s; u = -1.88796008463073496563746e-05; @@ -122,8 +122,8 @@ KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) constexpr f64 pi_over_2 = 1.5707963267948966192313216916398; constexpr f64 pi_over_4 = 0.78539816339744830961566084581988; r = mulsign(r, x); - r = select(isinf(x) || x == 0.0, pi_over_2 - select(x, mulsign(pi_over_2, x), 0.0), r); - r = select(isinf(y), pi_over_2 - select(x, mulsign(pi_over_4, x), 0.0), r); + r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); + r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); r = select(y == 0.0, fbitcast(ibitcast(x < 0) & ibitcast(pi)), r); r = fbitcast(ibitcast(isnan(x) || isnan(y)) | ibitcast(mulsign(r, y))); return r; @@ -158,7 +158,7 @@ KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& s) { vec<f64, N> t, u; vec<i64, N> q; - q = select(s < 0.0, 2ll, 0ll); + q = select(s < 0.0, i64(2), i64(0)); s = select(s < 0.0, -s, s); q = select(s > 1.0, q | 1, q); s = select(s > 1.0, 1.0 / s, s); diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp @@ -36,8 +36,8 @@ struct expression_iterator constexpr expression_iterator(E1&& e1) : e1(std::forward<E1>(e1)) {} struct iterator { - T operator*() { return get(); } - T get() { return expr.e1(cinput, position, vec_t<T, 1>())[0]; } + T operator*() const { return get(); } + T get() const { return expr.e1(cinput, position, vec_t<T, 1>())[0]; } iterator& operator++() { ++position; @@ -50,40 +50,40 @@ struct 
expression_iterator return copy; } bool operator!=(const iterator& other) const { return position != other.position; } - expression_iterator& expr; + const expression_iterator& expr; size_t position; }; - iterator begin() { return { *this, 0 }; } - iterator end() { return { *this, e1.size() }; } + iterator begin() const { return { *this, 0 }; } + iterator end() const { return { *this, e1.size() }; } E1 e1; }; } template <typename E1, typename T = value_type_of<E1>> -KFR_INLINE internal::expression_iterator<T, E1> to_iterator(E1&& e1) +CMT_INLINE internal::expression_iterator<T, E1> to_iterator(E1&& e1) { return internal::expression_iterator<T, E1>(std::forward<E1>(e1)); } template <typename T, typename... Ts> -KFR_INLINE auto sequence(T x, Ts... rest) +CMT_INLINE auto sequence(T x, Ts... rest) { const T seq[] = { x, static_cast<T>(rest)... }; constexpr size_t N = arraysize(seq); return lambda([=](size_t index) { return seq[index % N]; }); } -KFR_INLINE auto zeros() +CMT_INLINE auto zeros() { return lambda([](cinput_t, size_t, auto x) { return zerovector(x); }); } -KFR_INLINE auto ones() +CMT_INLINE auto ones() { return lambda([](cinput_t, size_t, auto x) { using U = subtype<decltype(x)>; return U(1); }); } -KFR_INLINE auto counter() +CMT_INLINE auto counter() { return lambda([](cinput_t, size_t index, auto x) { using T = subtype<decltype(x)>; @@ -93,7 +93,7 @@ KFR_INLINE auto counter() }); } template <typename T1> -KFR_INLINE auto counter(T1 start) +CMT_INLINE auto counter(T1 start) { return lambda([start](cinput_t, size_t index, auto x) { using T = subtype<decltype(x)>; @@ -103,7 +103,7 @@ KFR_INLINE auto counter(T1 start) }); } template <typename T1, typename T2> -KFR_INLINE auto counter(T1 start, T2 step) +CMT_INLINE auto counter(T1 start, T2 step) { return lambda([start, step](cinput_t, size_t index, auto x) { using T = subtype<decltype(x)>; @@ -135,13 +135,13 @@ template <typename T, typename E1> struct expression_reader { constexpr expression_reader(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {} - T read() + T read() const { const T result = e1(cinput, m_position, vec_t<T, 1>()); m_position++; return result; } - size_t m_position = 0; + mutable size_t m_position = 0; E1 e1; }; template <typename T, typename E1> @@ -192,7 +192,7 @@ struct expression_skip : expression<E1>, inherit_value_type<E1> { expression_skip(E1&& e1, size_t count) : expression<E1>(std::forward<E1>(e1)), count(count) {} template <typename T, size_t N> - KFR_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const { return this->argument_first(index + count, y); } @@ -218,7 +218,7 @@ struct expression_linspace<T, false> : input_expression } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const { using UI = itype<U>; return U(start) + (enumerate(x) + cast<U>(cast<UI>(index))) * U(offset); @@ -242,13 +242,13 @@ struct expression_linspace<T, true> : input_expression } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const { using UI = itype<U>; return mix((enumerate(x) + cast<U>(cast<UI>(index))) * invsize, cast<U>(start), cast<U>(stop)); } template <typename U, size_t N> - KFR_INLINE static vec<U, N> mix(vec<U, N> t, U x, U y) + CMT_INLINE static vec<U, N> 
mix(vec<U, N> t, U x, U y) { return (U(1.0) - t) * x + t * y; } @@ -265,7 +265,7 @@ public: using base = expression<E...>; template <typename... Expr_> - KFR_INLINE expression_sequence(const size_t (&segments)[base::size], Expr_&&... expr) noexcept + CMT_INLINE expression_sequence(const size_t (&segments)[base::size], Expr_&&... expr) noexcept : base(std::forward<Expr_>(expr)...) { std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1); @@ -274,7 +274,7 @@ public: } template <typename T, size_t N> - KFR_NOINLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const + CMT_NOINLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const { std::size_t sindex = size_t(std::upper_bound(std::begin(segments), std::end(segments), index) - 1 - std::begin(segments)); @@ -296,7 +296,7 @@ public: protected: template <typename T, size_t N> - KFR_NOINLINE vec<T, N> get(size_t index, size_t expr_index, vec_t<T, N> y) + CMT_NOINLINE vec<T, N> get(size_t index, size_t expr_index, vec_t<T, N> y) { return cswitch(indicesfor<E...>, expr_index, [&](auto val) { return this->argument(val, index, y); }, [&]() { return zerovector(y); }); @@ -307,13 +307,13 @@ protected: } template <typename E1> -KFR_INLINE internal::expression_skip<E1> skip(E1&& e1, size_t count = 1) +CMT_INLINE internal::expression_skip<E1> skip(E1&& e1, size_t count = 1) { return internal::expression_skip<E1>(std::forward<E1>(e1), count); } template <typename T1, typename T2, bool precise = false, typename TF = ftype<common_type<T1, T2>>> -KFR_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, +CMT_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, bool endpoint = false) { return internal::expression_linspace<TF, precise>(start, stop, size, endpoint); @@ -321,7 +321,7 @@ KFR_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop KFR_FN(linspace) template <typename T, bool precise = false, typename TF = ftype<T>> -KFR_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, +CMT_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, bool endpoint = false) { return internal::expression_linspace<TF, precise>(symmetric_linspace, symsize, size, endpoint); @@ -329,7 +329,7 @@ KFR_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, si KFR_FN(symmlinspace) template <size_t size, typename... E> -KFR_INLINE internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens) +CMT_INLINE internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... 
gens) { static_assert(size == sizeof...(E), "Lists must be of equal length"); return internal::expression_sequence<decay<E>...>(list, std::forward<E>(gens)...); @@ -348,7 +348,8 @@ struct multioutput : output_expression template <typename T, size_t N> void operator()(coutput_t, size_t index, const vec<T, N>& x) { - cfor(csize<0>, csize<sizeof...(E)>, [&](auto n) { std::get<val_of(n)>(outputs)(coutput, index, x); }); + cfor(csize<0>, csize<sizeof...(E)>, + [&](auto n) { std::get<val_of(decltype(n)())>(outputs)(coutput, index, x); }); } std::tuple<E...> outputs; diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp @@ -100,10 +100,12 @@ namespace cometa template <typename T> struct compound_type_traits<kfr::complex<T>> { - constexpr static size_t width = 2; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; + constexpr static size_t width = 2; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; template <typename U> using rebind = kfr::complex<U>; template <typename U> @@ -155,41 +157,41 @@ struct vec_op<complex<T>> : private vec_op<T> }; template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) { - return subcast<complex<T>>(dupeven(subcast<T>(x))); + return compcast<complex<T>>(dupeven(compcast<T>(x))); } KFR_FN(cdupreal) template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) { - return subcast<complex<T>>(dupodd(subcast<T>(x))); + return compcast<complex<T>>(dupodd(compcast<T>(x))); } KFR_FN(cdupimag) template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) { - return subcast<complex<T>>(swap<2>(subcast<T>(x))); + return compcast<complex<T>>(swap<2>(compcast<T>(x))); } KFR_FN(cswapreim) template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) { return x ^ complex<T>(-T(), T()); } KFR_FN(cnegreal) template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) { return x ^ complex<T>(T(), -T()); } KFR_FN(cnegimag) template <typename T, size_t N> -KFR_INLINE vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) +CMT_INLINE vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) { return cnegimag(x); } @@ -205,52 +207,54 @@ template <typename T> struct is_complex_impl<complex<T>> : std::true_type { }; -} - -// real to complex -template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept -{ - const vec<subtype<To>, N> casted = cast<subtype<To>>(value); - return subcast<To>(interleave(casted, zerovector(casted))); -} -// complex to complex -template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept +// vector<complex> 
to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<complex<From>, N>> { - return subcast<To>(cast<subtype<To>>(subcast<From>(value))); -} + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<complex<From>, N>& value) + { + return builtin_convertvector<complex<To>>(value); + } +}; -// complex to real -template <typename To, typename From, size_t N, KFR_ENABLE_IF(!internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept +// vector to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<From, N>> { - static_assert(sizeof(To) == 0, "Can't cast complex to real"); - return {}; + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<From, N>& value) + { + const vec<To, N> casted = static_cast<vec<To, N>>(value); + return *interleave(casted, zerovector(casted)); + } +}; } template <typename T, size_t N> -constexpr KFR_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) +constexpr CMT_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) { - return subcast<complex<T>>(x); + return compcast<complex<T>>(x); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) +constexpr CMT_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) { - return subcast<T>(x); + return compcast<T>(x); } template <typename T> -constexpr KFR_INLINE T real(const complex<T>& value) +constexpr CMT_INLINE T real(const complex<T>& value) { return value.real(); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> real(const vec<complex<T>, N>& value) +constexpr CMT_INLINE vec<T, N> real(const vec<complex<T>, N>& value) { - return even(subcast<T>(value)); + return even(compcast<T>(value)); } template <typename T> @@ -260,36 +264,36 @@ using realftype = ftype<decltype(kfr::real(std::declval<T>()))>; KFR_FN(real) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_real, E1> real(E1&& x) +CMT_INLINE internal::expression_function<fn_real, E1> real(E1&& x) { return { {}, std::forward<E1>(x) }; } template <typename T> -constexpr KFR_INLINE T imag(const complex<T>& value) +constexpr CMT_INLINE T imag(const complex<T>& value) { return value.imag(); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> imag(const vec<complex<T>, N>& value) +constexpr CMT_INLINE vec<T, N> imag(const vec<complex<T>, N>& value) { - return odd(subcast<T>(value)); + return odd(compcast<T>(value)); } KFR_FN(imag) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_imag, E1> imag(E1&& x) +CMT_INLINE internal::expression_function<fn_imag, E1> imag(E1&& x) { return { {}, std::forward<E1>(x) }; } template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>> -constexpr KFR_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0)) +constexpr CMT_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0)) { - return subcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag))); + return compcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag))); } template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>> 
-constexpr KFR_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0)) +constexpr CMT_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0)) { return complex<T>(cast<T>(real), cast<T>(imag)); } diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp @@ -35,10 +35,10 @@ namespace internal template <typename From, typename E> struct expression_convert : expression<E> { - KFR_INLINE expression_convert(E&& expr) noexcept : expression<E>(std::forward<E>(expr)) {} + CMT_INLINE expression_convert(E&& expr) noexcept : expression<E>(std::forward<E>(expr)) {} template <typename T, size_t N> - KFR_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const { return this->argument_first(index, vec_t<From, N>()); } @@ -46,7 +46,7 @@ struct expression_convert : expression<E> } template <typename From, typename E> -KFR_INLINE internal::expression_convert<From, decay<E>> convert(E&& expr) +CMT_INLINE internal::expression_convert<From, decay<E>> convert(E&& expr) { return internal::expression_convert<From, decay<E>>(std::forward<E>(expr)); } diff --git a/include/kfr/base/cpuid.hpp b/include/kfr/base/cpuid.hpp @@ -27,6 +27,7 @@ namespace kfr { +#ifdef CMT_ARCH_X86 struct cpu_features { @@ -104,17 +105,17 @@ struct cpu_data u32 data[4]; }; -#if defined KFR_COMPILER_GNU || defined KFR_COMPILER_CLANG -KFR_INLINE u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx) +#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG +CMT_INLINE u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx) { __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc)); return 1; } -KFR_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) +CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]); } -KFR_INLINE u32 get_xcr0() +CMT_INLINE u32 get_xcr0() { u32 xcr0; __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); @@ -244,31 +245,28 @@ cpu_t detect_cpu() c.hasAVXOSSUPPORT = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06; c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0; -#ifdef KFR_AVAIL_AVX2 if (c.hasAVX2 && c.hasAVXOSSUPPORT) return cpu_t::avx2; -#endif -#ifdef KFR_AVAIL_AVX if (c.hasAVX && c.hasAVXOSSUPPORT) return cpu_t::avx1; -#endif -#ifdef KFR_AVAIL_SSE41 if (c.hasSSE41) return cpu_t::sse41; -#endif -#ifdef KFR_AVAIL_SSSE3 if (c.hasSSSE3) return cpu_t::ssse3; -#endif -#ifdef KFR_AVAIL_SSE3 if (c.hasSSE3) return cpu_t::sse3; -#endif -#ifdef KFR_AVAIL_SSE2 if (c.hasSSE2) return cpu_t::sse2; -#endif return cpu_t::lowest; } } +#else + +template <size_t = 0> +cpu_t detect_cpu() +{ + return cpu_t::native; +} + +#endif } diff --git a/include/kfr/base/cpuid_auto.hpp b/include/kfr/base/cpuid_auto.hpp @@ -29,19 +29,19 @@ namespace kfr namespace internal { -KFR_INLINE cpu_t& cpu_v() +CMT_INLINE cpu_t& cpu_v() { static cpu_t v1 = cpu_t::native; return v1; } -KFR_INLINE char init_cpu_v() +CMT_INLINE char init_cpu_v() { cpu_v() = detect_cpu<0>(); return 0; } -KFR_INLINE char init_dummyvar() +CMT_INLINE char init_dummyvar() { static char dummy = init_cpu_v(); return dummy; @@ -49,5 +49,5 @@ KFR_INLINE char init_dummyvar() static char dummyvar = init_dummyvar(); } -KFR_INLINE cpu_t get_cpu() { return internal::cpu_v(); } +CMT_INLINE cpu_t get_cpu() { return internal::cpu_v(); } } diff --git 
a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp @@ -90,19 +90,19 @@ struct shuffle_index_digitreverse } template <size_t radix, size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> digitreverse(const vec<T, N>& x) +CMT_INLINE vec<T, N> digitreverse(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_digitreverse<radix, ilog2(N / groupsize)>, groupsize>(x); } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> bitreverse(const vec<T, N>& x) +CMT_INLINE vec<T, N> bitreverse(const vec<T, N>& x) { return digitreverse<2, groupsize>(x); } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> digitreverse4(const vec<T, N>& x) +CMT_INLINE vec<T, N> digitreverse4(const vec<T, N>& x) { return digitreverse<4, groupsize>(x); } diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp @@ -65,11 +65,11 @@ struct expression : input_expression expression() = delete; constexpr expression(Args&&... args) noexcept : args(std::forward<Args>(args)...) {} - KFR_INLINE void begin_block(size_t size) { begin_block_impl(size, indicesfor_t<Args...>()); } - KFR_INLINE void end_block(size_t size) { end_block_impl(size, indicesfor_t<Args...>()); } + CMT_INLINE void begin_block(size_t size) { begin_block_impl(size, indicesfor_t<Args...>()); } + CMT_INLINE void end_block(size_t size) { end_block_impl(size, indicesfor_t<Args...>()); } - KFR_INLINE void begin_block(size_t size) const { begin_block_impl(size, indicesfor_t<Args...>()); } - KFR_INLINE void end_block(size_t size) const { end_block_impl(size, indicesfor_t<Args...>()); } + CMT_INLINE void begin_block(size_t size) const { begin_block_impl(size, indicesfor_t<Args...>()); } + CMT_INLINE void end_block(size_t size) const { end_block_impl(size, indicesfor_t<Args...>()); } protected: std::tuple<Args...> args; @@ -81,57 +81,56 @@ protected: } template <typename Fn, typename T, size_t N> - KFR_INLINE vec<T, N> call(Fn&& fn, size_t index, vec_t<T, N> x) const + CMT_INLINE vec<T, N> call(Fn&& fn, size_t index, vec_t<T, N> x) const { return call_impl(std::forward<Fn>(fn), indicesfor_t<Args...>(), index, x); } template <size_t ArgIndex, typename T, size_t N> - KFR_INLINE vec<T, N> argument(csize_t<ArgIndex>, size_t index, vec_t<T, N> x) const + CMT_INLINE vec<T, N> argument(csize_t<ArgIndex>, size_t index, vec_t<T, N> x) const { static_assert(ArgIndex < count, "Incorrect ArgIndex"); return std::get<ArgIndex>(this->args)(cinput, index, x); } template <typename T, size_t N> - KFR_INLINE vec<T, N> argument_first(size_t index, vec_t<T, N> x) const + CMT_INLINE vec<T, N> argument_first(size_t index, vec_t<T, N> x) const { return std::get<0>(this->args)(cinput, index, x); } private: template <typename Arg, size_t N, typename Tin, - typename Tout1 = conditional<is_generic<Arg>::value, Tin, typename decay<Arg>::value_type>, - typename Tout = Tout1> - KFR_INLINE vec_t<Tout, N> vec_t_for() const + typename Tout = conditional<is_generic<Arg>::value, Tin, value_type_of<Arg>>> + CMT_INLINE vec_t<Tout, N> vec_t_for() const { return {}; } template <typename Fn, typename T, size_t N, size_t... 
indices> - KFR_INLINE vec<T, N> call_impl(Fn&& fn, csizes_t<indices...>, size_t index, vec_t<T, N>) const + CMT_INLINE vec<T, N> call_impl(Fn&& fn, csizes_t<indices...>, size_t index, vec_t<T, N>) const { using ratio = func_ratio<Fn>; constexpr size_t Nin = N * ratio::input / ratio::output; using Tout = conditional<is_same<generic, value_type>::value, T, common_type<T, value_type>>; - return cast<T>(fn(cast<Tout>(std::get<indices>(this->args)( - cinput, index * ratio::input / ratio::output, vec_t_for<Args, Nin, Tout>()))...)); + return fn(std::get<indices>(this->args)(cinput, index * ratio::input / ratio::output, + vec_t_for<Args, Nin, Tout>())...); } template <size_t... indices> - KFR_INLINE void begin_block_impl(size_t size, csizes_t<indices...>) + CMT_INLINE void begin_block_impl(size_t size, csizes_t<indices...>) { swallow{ (std::get<indices>(args).begin_block(size), 0)... }; } template <size_t... indices> - KFR_INLINE void end_block_impl(size_t size, csizes_t<indices...>) + CMT_INLINE void end_block_impl(size_t size, csizes_t<indices...>) { swallow{ (std::get<indices>(args).end_block(size), 0)... }; } template <size_t... indices> - KFR_INLINE void begin_block_impl(size_t size, csizes_t<indices...>) const + CMT_INLINE void begin_block_impl(size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).begin_block(size), 0)... }; } template <size_t... indices> - KFR_INLINE void end_block_impl(size_t size, csizes_t<indices...>) const + CMT_INLINE void end_block_impl(size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).end_block(size), 0)... }; } @@ -147,9 +146,9 @@ struct expression_scalar : input_expression const vec<T, width> val; template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const { - return resize<N>(cast<U>(val)); + return resize<N>(static_cast<vec<U, width>>(val)); } }; @@ -185,7 +184,7 @@ struct expression_function : expression<arg<Args>...> { } template <typename T, size_t N> - KFR_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const { static_assert(is_same<T, value_type_of<expression_function>>::value || is_generic<expression_function>::value, @@ -198,37 +197,37 @@ protected: }; template <typename Tout, typename Tin, size_t width, typename OutFn, typename Fn> -KFR_INLINE void process_cycle(OutFn&& outfn, const Fn& fn, size_t& i, size_t size) +CMT_INLINE void process_cycle(OutFn&& outfn, const Fn& fn, size_t& i, size_t size) { const size_t count = size / width * width; - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (; i < count; i += width) { - outfn(coutput, i, cast<Tout>(fn(cinput, i, vec_t<Tin, width>()))); + outfn(coutput, i, fn(cinput, i, vec_t<Tin, width>())); } } } template <typename A> -KFR_INLINE internal::arg<A> e(A&& a) +CMT_INLINE internal::arg<A> e(A&& a) { return internal::arg<A>(std::forward<A>(a)); } template <typename T> -KFR_INLINE internal::expression_scalar<T> scalar(const T& val) +CMT_INLINE internal::expression_scalar<T> scalar(const T& val) { return internal::expression_scalar<T>(val); } template <typename T, size_t N> -KFR_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val) +CMT_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val) { return internal::expression_scalar<T, N>(val); } template <typename Fn, typename... 
Args> -KFR_INLINE internal::expression_function<decay<Fn>, internal::arg<Args>...> bind_expression(Fn&& fn, +CMT_INLINE internal::expression_function<decay<Fn>, internal::arg<Args>...> bind_expression(Fn&& fn, Args&&... args) { return internal::expression_function<decay<Fn>, internal::arg<Args>...>(std::forward<Fn>(fn), @@ -236,7 +235,7 @@ KFR_INLINE internal::expression_function<decay<Fn>, internal::arg<Args>...> bind } template <typename Tout, cpu_t c = cpu_t::native, size_t width = 0, typename OutFn, typename Fn> -KFR_INLINE void process(OutFn&& outfn, const Fn& fn, size_t size) +CMT_INLINE void process(OutFn&& outfn, const Fn& fn, size_t size) { static_assert(is_output_expression<OutFn>::value, "OutFn must be an expression"); static_assert(is_input_expression<Fn>::value, "Fn must be an expression"); @@ -267,9 +266,9 @@ struct expressoin_typed : input_expression expressoin_typed(E1&& e1) : e1(std::forward<E1>(e1)) {} template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { - return cast<U>(e1(cinput, index, vec_t<T, N>())); + return e1(cinput, index, vec_t<T, N>()); } E1 e1; }; @@ -283,10 +282,10 @@ struct expressoin_sized : input_expression expressoin_sized(E1&& e1, size_t size) : e1(std::forward<E1>(e1)), m_size(size) {} template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { auto val = e1(cinput, index, vec_t<T, N>()); - return cast<U>(val); + return val; } constexpr size_t size() const noexcept { return m_size; } diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -46,7 +46,7 @@ using flt_type = conditional<std::is_floating_point<deep_subtype<T>>::value, T, namespace intrinsics { -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 using f32sse = vec<f32, 4>; using f64sse = vec<f64, 2>; using i8sse = vec<i8, 16>; @@ -117,10 +117,10 @@ using mu64neon = mask<u64, 2>; template <cpu_t c, typename T> constexpr inline size_t next_simd_width(size_t n) { -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 return n > vector_width<T, cpu_t::sse2> ? 
vector_width<T, c> : vector_width<T, cpu_t::sse2>; #endif -#ifdef CID_ARCH_ARM +#ifdef CMT_ARCH_ARM return vector_width<T, cpu_t::neon>; #endif } diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp @@ -25,7 +25,7 @@ #include "log_exp.hpp" #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wc99-extensions") +#if CMT_HAS_WARNING("-Wc99-extensions") #pragma clang diagnostic ignored "-Wc99-extensions" #endif @@ -46,7 +46,7 @@ KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z) { constexpr size_t Count = arraysize(gamma_precalc<T>); vec<T, N> accm = gamma_precalc<T>[0]; - KFR_LOOP_UNROLL + CMT_LOOP_UNROLL for (size_t k = 1; k < Count; k++) accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k)); accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp @@ -41,9 +41,9 @@ struct generator : input_expression using type = T; template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N> t) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N> t) const { - return cast<U>(generate(t)); + return generate(t); } void resync(T start) const { ptr_cast<Class>(this)->sync(start); } @@ -65,7 +65,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N == width)> - KFR_INLINE vec<T, N> generate(vec_t<T, N>) const + CMT_INLINE vec<T, N> generate(vec_t<T, N>) const { const vec<T, N> result = value; call_next(); @@ -73,7 +73,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N < width)> - KFR_INLINE vec<T, N> generate(vec_t<T, N>) const + CMT_INLINE vec<T, N> generate(vec_t<T, N>) const { const vec<T, N> result = narrow<N>(value); shift(csize<N>); @@ -81,7 +81,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N > width)> - KFR_INLINE vec<T, N> generate(vec_t<T, N> x) const + CMT_INLINE vec<T, N> generate(vec_t<T, N> x) const { const auto lo = generate(low(x)); const auto hi = generate(high(x)); @@ -99,16 +99,16 @@ struct generator_linear : generator<T, width, generator_linear<T, width>> this->resync(start); } - KFR_INLINE void sync(T start) const noexcept { this->value = start + enumerate<T, width>() * step; } + CMT_INLINE void sync(T start) const noexcept { this->value = start + enumerate<T, width>() * step; } - KFR_INLINE void next() const noexcept { this->value += vstep; } + CMT_INLINE void next() const noexcept { this->value += vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)> +template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> struct generator_exp : generator<T, width, generator_exp<T, width>> { generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step* width))[0] - 1) @@ -116,16 +116,16 @@ struct generator_exp : generator<T, width, generator_exp<T, width>> this->resync(start); } - KFR_INLINE void sync(T start) const noexcept { this->value = exp(start + enumerate<T, width>() * step); } + CMT_INLINE void sync(T start) const noexcept { this->value = exp(start + enumerate<T, width>() * step); } - KFR_INLINE void next() const noexcept { this->value += this->value * vstep; } + CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)> +template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> struct generator_exp2 : generator<T, width, generator_exp2<T, width>> { generator_exp2(T 
start, T step) noexcept : step(step), vstep(exp2(make_vector(step* width))[0] - 1) @@ -133,16 +133,16 @@ struct generator_exp2 : generator<T, width, generator_exp2<T, width>> this->resync(start); } - KFR_INLINE void sync(T start) const noexcept { this->value = exp2(start + enumerate<T, width>() * step); } + CMT_INLINE void sync(T start) const noexcept { this->value = exp2(start + enumerate<T, width>() * step); } - KFR_INLINE void next() const noexcept { this->value += this->value * vstep; } + CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)> +template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> struct generator_cossin : generator<T, width, generator_cossin<T, width>> { generator_cossin(T start, T step) @@ -150,9 +150,9 @@ struct generator_cossin : generator<T, width, generator_cossin<T, width>> { this->resync(start); } - KFR_INLINE void sync(T start) const noexcept { this->value = init_cossin(step, start); } + CMT_INLINE void sync(T start) const noexcept { this->value = init_cossin(step, start); } - KFR_INLINE void next() const noexcept + CMT_INLINE void next() const noexcept { this->value = this->value - subadd(alpha * this->value, beta * swap<2>(this->value)); } @@ -161,13 +161,13 @@ protected: T step; T alpha; T beta; - KFR_NOINLINE static vec<T, width> init_cossin(T w, T phase) + CMT_NOINLINE static vec<T, width> init_cossin(T w, T phase) { return cossin(dup(phase + enumerate<T, width / 2>() * w)); } }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4)> +template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4), KFR_ARCH_DEP> struct generator_sin : generator<T, width, generator_sin<T, width>> { generator_sin(T start, T step) @@ -175,14 +175,14 @@ struct generator_sin : generator<T, width, generator_sin<T, width>> { this->resync(start); } - KFR_INLINE void sync(T start) const noexcept + CMT_INLINE void sync(T start) const noexcept { const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step))); this->cos_value = low(cs); this->value = high(cs); } - KFR_INLINE void next() const noexcept + CMT_INLINE void next() const noexcept { const vec<T, width> c = this->cos_value; const vec<T, width> s = this->value; diff --git a/include/kfr/base/intrinsics.h b/include/kfr/base/intrinsics.h @@ -2,13 +2,13 @@ #include "kfr.h" -#ifdef CID_ARCH_SSE2 +#ifdef CMT_ARCH_SSE2 #include <immintrin.h> -#ifdef KFR_OS_WIN +#ifdef CMT_OS_WIN #include <intrin.h> #endif #endif -#ifdef CID_ARCH_NEON +#ifdef CMT_ARCH_NEON #include <arm_neon.h> #endif diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h @@ -5,51 +5,6 @@ #include "../cident.h" -#define KFR_INLINE CID_INLINE -#define KFR_INLINE_MEMBER CID_INLINE_MEMBER -#define KFR_INLINE_LAMBDA CID_INLINE_LAMBDA -#define KFR_NOINLINE CID_NOINLINE -#define KFR_FLATTEN CID_FLATTEN -#define KFR_RESTRICT CID_RESTRICT - -#ifdef CID_COMPILER_CLANG -#define KFR_COMPILER_CLANG CID_COMPILER_CLANG -#endif - -#ifdef CID_OS_WIN -#define KFR_OS_WIN CID_OS_WIN -#endif - -#ifdef CID_OS_OSX -#define KFR_OS_OSX CID_OS_OSX -#endif - -#ifdef CID_OS_LINUX -#define KFR_OS_LINUX CID_OS_LINUX -#endif - -#ifdef CID_GNU_ATTRIBUTES -#define KFR_GNU_ATTRIBUTES CID_GNU_ATTRIBUTES -#endif - -#ifdef CID_MSVC_ATTRIBUTES -#define KFR_MSVC_ATTRIBUTES CID_MSVC_ATTRIBUTES -#endif - -#ifdef CID_ARCH_X64 -#define KFR_ARCH_X64 CID_ARCH_X64 -#endif 
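// A standalone sketch (plain C++, not KFR code) of the incremental scheme the
// generators above use: keep a short vector of current output values and
// advance the whole block with one cheap update instead of re-evaluating the
// function per sample. generator_linear adds width*step per block;
// generator_exp/exp2 multiply by a constant per-block ratio, as below.
#include <array>
#include <cmath>
#include <cstdio>

int main()
{
    constexpr int width = 4;          // stands in for vec<T, width>
    const double start = 0.0, step = 0.1;

    std::array<double, width> value;  // sync(start): value = exp(start + {0..width-1}*step)
    for (int i = 0; i < width; ++i)
        value[i] = std::exp(start + i * step);

    const double ratio = std::exp(step * width); // constant per-block ratio

    for (int block = 0; block < 3; ++block)
    {
        for (double v : value)
            std::printf("%.4f ", v);  // consume the current block
        for (double& v : value)
            v *= ratio;               // next(): one multiply per lane, no exp() calls
    }
    std::printf("\n");
}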
- -#ifdef CID_ARCH_X32 -#define KFR_ARCH_X32 CID_ARCH_X32 -#endif - -#define KFR_ARCH_NAME CID_ARCH_NAME - -#define KFR_CDECL CID_CDECL - -#define KFR_PUBLIC_C CID_PUBLIC_C - #ifdef __cplusplus namespace kfr { @@ -74,59 +29,5 @@ constexpr int version = KFR_VERSION; } #endif -//#define KFR_MEMORY_ALIGNMENT 64 - -#if KFR_COMPILER_CLANG -#define KFR_LOOP_NOUNROLL \ - _Pragma("clang loop vectorize( disable )") _Pragma("clang loop interleave( disable )") \ - _Pragma("clang loop unroll( disable )") - -#define KFR_LOOP_UNROLL _Pragma("clang loop unroll( full )") - -#define KFR_VEC_CC __attribute__((vectorcall)) -#else -#define KFR_LOOP_NOUNROLL -#define KFR_LOOP_UNROLL -#ifdef KFR_COMPILER_MSVC -#define KFR_VEC_CC __vectorcall -#endif - -#endif - -#define KFR_AVAIL_AVX2 1 -#define KFR_AVAIL_AVX 1 -#define KFR_AVAIL_SSE42 1 -#define KFR_AVAIL_SSE41 1 -#define KFR_AVAIL_SSSE3 1 -#define KFR_AVAIL_SSE3 1 -#define KFR_AVAIL_SSE2 1 -#define KFR_AVAIL_SSE 1 - -#if defined(KFR_GNU_ATTRIBUTES) - -#define KFR_CPU_NAME_avx2 "avx2" -#define KFR_CPU_NAME_avx "avx" -#define KFR_CPU_NAME_sse42 "sse4.2" -#define KFR_CPU_NAME_sse41 "sse4.1" -#define KFR_CPU_NAME_ssse3 "ssse3" -#define KFR_CPU_NAME_sse3 "sse3" -#define KFR_CPU_NAME_sse2 "sse2" - -#define KFR_USE_CPU(arch) __attribute__((target(KFR_CPU_NAME_##arch))) - -#else -#define KFR_USE_CPU(arch) -#endif - -#if defined(KFR_GNU_ATTRIBUTES) -#define KFR_FAST_CC __attribute__((fastcall)) -#else -#define KFR_FAST_CC __fastcall -#endif - -#define KFR_INTRIN CID_INTRIN -#define KFR_SINTRIN CID_INTRIN CID_NODEBUG static -#define KFR_AINTRIN inline CID_NODEBUG static -#define KFR_FAST_NOINLINE CID_NOINLINE - -#define KFR_CPU_INTRIN(c) KFR_AINTRIN KFR_USE_CPU(c) +#define KFR_INTRIN CMT_INTRIN +#define KFR_SINTRIN CMT_INTRIN static diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp @@ -81,7 +81,7 @@ KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q) template <typename T, size_t N> KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x) { - return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1)); + return select(x == T(), -c_infinity<T>, static_cast<vec<T, N>>(vilogbp1(x) - 1)); } template <size_t N> diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -46,9 +46,9 @@ struct bitmask type value; }; -#if defined CID_ARCH_SSE2 +#if defined CMT_ARCH_SSE2 -#if defined CID_ARCH_SSE41 +#if defined CMT_ARCH_SSE41 KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); } KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); } @@ -69,7 +69,7 @@ KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allon KFR_SINTRIN bool bittestall(const i64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } #endif -#if defined CID_ARCH_AVX +#if defined CMT_ARCH_AVX KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); } KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); } KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); } @@ -98,7 +98,7 @@ KFR_SINTRIN bool bittestall(const i8avx& x) { return _mm256_testc_si256(*x, *all KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -#elif defined CID_ARCH_SSE41 +#elif defined 
CMT_ARCH_SSE41 KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } KFR_SINTRIN bool bittestall(const f32sse& x) @@ -111,7 +111,7 @@ KFR_SINTRIN bool bittestall(const f64sse& x) } #endif -#if !defined CID_ARCH_SSE41 +#if !defined CMT_ARCH_SSE41 KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); } KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); } @@ -158,6 +158,59 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a) return bittestany(low(a)) || bittestany(high(a)); } +#elif CMT_ARCH_NEON + +KFR_SINTRIN bool bittestall(const u32neon& a) +{ + const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_SINTRIN bool bittestany(const u32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_SINTRIN bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } + +KFR_SINTRIN bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +KFR_SINTRIN bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +KFR_SINTRIN bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +KFR_SINTRIN bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +KFR_SINTRIN bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + #else template <typename T, size_t N> diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp @@ -83,15 +83,15 @@ inline void aligned_free(void* ptr) } template <typename T = void, size_t alignment = native_cache_alignment> -KFR_INLINE T* 
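// A minimal sketch of the reduction used by the NEON bittestall/bittestany
// overloads above (illustration only; requires an ARM NEON target): fold the
// two 64-bit halves of the mask together, then use a pairwise min/max so the
// "all lanes" / "any lane" answer lands in a single 32-bit lane.
#include <arm_neon.h>

static bool all_bits_set(uint32x4_t m)
{
    const uint32x2_t folded = vand_u32(vget_low_u32(m), vget_high_u32(m));
    return vget_lane_u32(vpmin_u32(folded, folded), 0) == 0xFFFFFFFFu;
}

static bool any_bit_set(uint32x4_t m)
{
    const uint32x2_t folded = vorr_u32(vget_low_u32(m), vget_high_u32(m));
    return vget_lane_u32(vpmax_u32(folded, folded), 0) != 0;
}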
aligned_allocate(size_t size = 1) +CMT_INLINE T* aligned_allocate(size_t size = 1) { - T* ptr = static_cast<T*>(__builtin_assume_aligned( + T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED( internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>), alignment), alignment)); return ptr; } template <typename T = void> -KFR_INLINE void aligned_deallocate(T* ptr) +CMT_INLINE void aligned_deallocate(T* ptr) { return internal::aligned_free(ptr); } @@ -101,29 +101,29 @@ namespace internal template <typename T> struct aligned_deleter { - KFR_INLINE void operator()(T* ptr) const { aligned_deallocate(ptr); } + CMT_INLINE void operator()(T* ptr) const { aligned_deallocate(ptr); } }; } template <typename T> struct autofree { - KFR_INLINE autofree() {} - explicit KFR_INLINE autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} + CMT_INLINE autofree() {} + explicit CMT_INLINE autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} autofree(const autofree&) = delete; autofree& operator=(const autofree&) = delete; autofree(autofree&&) noexcept = default; autofree& operator=(autofree&&) noexcept = default; - KFR_INLINE T& operator[](size_t index) noexcept { return ptr[index]; } - KFR_INLINE const T& operator[](size_t index) const noexcept { return ptr[index]; } + CMT_INLINE T& operator[](size_t index) noexcept { return ptr[index]; } + CMT_INLINE const T& operator[](size_t index) const noexcept { return ptr[index]; } template <typename U = T> - KFR_INLINE U* data() noexcept + CMT_INLINE U* data() noexcept { return ptr_cast<U>(ptr.get()); } template <typename U = T> - KFR_INLINE const U* data() const noexcept + CMT_INLINE const U* data() const noexcept { return ptr_cast<U>(ptr.get()); } @@ -159,7 +159,7 @@ struct allocator { pointer result = aligned_allocate<value_type>(n); if (!result) - CID_THROW(std::bad_alloc()); + CMT_THROW(std::bad_alloc()); return result; } void deallocate(pointer p, size_type) { aligned_deallocate(p); } diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -33,7 +33,7 @@ namespace kfr namespace intrinsics { -#if defined CID_ARCH_SSE2 +#if defined CMT_ARCH_SSE2 KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); } KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); } @@ -49,7 +49,7 @@ KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16( KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } -#if defined CID_ARCH_AVX2 +#if defined CMT_ARCH_AVX2 KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); } KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); } KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); } @@ -70,14 +70,14 @@ KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } #endif -#if defined CID_ARCH_AVX +#if defined CMT_ARCH_AVX KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); } KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); } KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); } KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); } #endif -#if 
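// A sketch of what aligned_allocate/aligned_deallocate above provide, written
// against standard C++17 only (not the KFR allocator itself): an allocation
// whose address is cache-line aligned so SIMD loads/stores can assume
// alignment. std::aligned_alloc requires the size to be a multiple of the
// alignment; availability on MSVC differs.
#include <cassert>
#include <cstdint>
#include <cstdlib>

int main()
{
    constexpr std::size_t alignment = 64; // typical cache line
    float* p = static_cast<float*>(std::aligned_alloc(alignment, 1024 * sizeof(float)));
    assert(reinterpret_cast<std::uintptr_t>(p) % alignment == 0);
    std::free(p);
}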
defined CID_ARCH_SSE41 +#if defined CMT_ARCH_SSE41 KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); } KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); } KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(*x, *y); } @@ -103,6 +103,37 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, KFR_HANDLE_ALL_SIZES_2(min) KFR_HANDLE_ALL_SIZES_2(max) +#elif defined CMT_ARCH_NEON + +KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); } +KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); } +KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); } +KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); } +KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); } +KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); } + +KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); } +KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); } +KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); } +KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); } +KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); } +KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); } +KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_SINTRIN u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); } +KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); } +#if defined CMT_ARCH_NEON64 +KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); } +KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); } +#else +KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + #else // fallback diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp @@ -25,7 +25,7 @@ #include "log_exp.hpp" #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wc99-extensions") +#if CMT_HAS_WARNING("-Wc99-extensions") #pragma clang diagnostic ignored "-Wc99-extensions" #endif @@ -77,7 +77,7 @@ constexpr T bessel_coef[] = { T(0.25), T(1.5021381070956226783e-096) }; template <typename T, size_t N> -KFR_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) +CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) { const vec<T, N> x_2 = x * 0.5; const vec<T, N> x_2_sqr = x_2 * x_2; @@ -85,7 +85,7 @@ KFR_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) vec<T, N> result; result = 1 + x_2_sqr; - KFR_LOOP_UNROLL + CMT_LOOP_UNROLL for (size_t i = 0; i < (sizeof(T) == 4 ? 
20 : 39); i++) { result = fmadd((num *= x_2_sqr), bessel_coef<T>[i], result); diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp @@ -32,18 +32,18 @@ namespace internal { template <typename T, typename ReduceFn> -KFR_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) +CMT_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) { return T(value[0]); } template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> -KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) { return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce)); } template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> -KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) { const T initial = reduce(initialvalue<T>()); return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); @@ -51,7 +51,7 @@ KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) } template <typename T, size_t N, typename ReduceFn> -KFR_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce) +CMT_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce) { return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce)); } @@ -74,16 +74,14 @@ constexpr inline T add(initialvalue<T>) KFR_FN(add) template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_add, E1, E2> add(E1&& x, E2&& y) +CMT_INLINE internal::expression_function<fn_add, E1, E2> add(E1&& x, E2&& y) { return { fn_add(), std::forward<E1>(x), std::forward<E2>(y) }; } template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_add, E1> add(E1&& x, E2&& y, E3&& z) +CMT_INLINE internal::expression_function<fn_add, E1> add(E1&& x, E2&& y, E3&& z) { - return { fn_add(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) - - }; + return { fn_add(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; } template <typename T1, typename T2> @@ -99,11 +97,9 @@ constexpr inline T sub(initialvalue<T>) KFR_FN(sub) template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_sub, E1, E2> sub(E1&& x, E2&& y) +CMT_INLINE internal::expression_function<fn_sub, E1, E2> sub(E1&& x, E2&& y) { - return { fn_sub(), std::forward<E1>(x), std::forward<E2>(y) - - }; + return { fn_sub(), std::forward<E1>(x), std::forward<E2>(y) }; } template <typename T1> @@ -124,12 +120,12 @@ constexpr inline T mul(initialvalue<T>) } KFR_FN(mul) template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_mul, E1, E2> mul(E1&& x, E2&& y) +CMT_INLINE internal::expression_function<fn_mul, E1, E2> mul(E1&& x, E2&& y) { return { fn_mul(), std::forward<E1>(x), std::forward<E2>(y) }; } template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_mul, E1> mul(E1&& x, E2&& y, E3&& z) +CMT_INLINE internal::expression_function<fn_mul, E1> mul(E1&& x, E2&& y, E3&& z) { return { fn_mul(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; } @@ 
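// A plain-C++ sketch (not KFR code) of the horizontal() reduction above:
// the vector is reduced by repeatedly combining its low and high halves;
// non-power-of-two widths are first widened with the operation's identity.
// The sketch below handles the power-of-two case only.
#include <array>
#include <cstdio>

template <typename T, std::size_t N, typename Fn>
T horizontal_sketch(std::array<T, N> v, Fn reduce)
{
    static_assert((N & (N - 1)) == 0, "sketch assumes a power-of-two width");
    for (std::size_t half = N / 2; half >= 1; half /= 2)
        for (std::size_t i = 0; i < half; ++i)
            v[i] = reduce(v[i], v[i + half]); // combine low and high halves
    return v[0];
}

int main()
{
    std::array<float, 8> x{ 1, 2, 3, 4, 5, 6, 7, 8 };
    std::printf("%f\n", horizontal_sketch(x, [](float a, float b) { return a + b; })); // 36
}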
-141,7 +137,7 @@ constexpr inline T1 sqr(T1 x) } KFR_FN(sqr) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_sqr, E1> sqr(E1&& x) +CMT_INLINE internal::expression_function<fn_sqr, E1> sqr(E1&& x) { return { fn_sqr(), std::forward<E1>(x) }; } @@ -154,11 +150,9 @@ constexpr inline T1 cub(T1 x) KFR_FN(cub) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_cub, E1> cub(E1&& x) +CMT_INLINE internal::expression_function<fn_cub, E1> cub(E1&& x) { - return { fn_cub(), std::forward<E1>(x) - - }; + return { fn_cub(), std::forward<E1>(x) }; } template <typename T> @@ -190,32 +184,24 @@ KFR_FN(pow4) KFR_FN(pow5) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_pow2, E1> pow2(E1&& x) +CMT_INLINE internal::expression_function<fn_pow2, E1> pow2(E1&& x) { - return { fn_pow2(), std::forward<E1>(x) - - }; + return { fn_pow2(), std::forward<E1>(x) }; } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_pow3, E1> pow3(E1&& x) +CMT_INLINE internal::expression_function<fn_pow3, E1> pow3(E1&& x) { - return { fn_pow3(), std::forward<E1>(x) - - }; + return { fn_pow3(), std::forward<E1>(x) }; } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_pow4, E1> pow4(E1&& x) +CMT_INLINE internal::expression_function<fn_pow4, E1> pow4(E1&& x) { - return { fn_pow4(), std::forward<E1>(x) - - }; + return { fn_pow4(), std::forward<E1>(x) }; } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INLINE internal::expression_function<fn_pow5, E1> pow5(E1&& x) +CMT_INLINE internal::expression_function<fn_pow5, E1> pow5(E1&& x) { - return { fn_pow5(), std::forward<E1>(x) - - }; + return { fn_pow5(), std::forward<E1>(x) }; } /// Raise x to the power base $x^{base}$ @@ -239,7 +225,7 @@ constexpr inline T ipow(T x, int base) KFR_FN(ipow) template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_ipow, E1, E2> ipow(E1&& x, E2&& b) +CMT_INLINE internal::expression_function<fn_ipow, E1, E2> ipow(E1&& x, E2&& b) { return { fn_ipow(), std::forward<E1>(x), std::forward<E2>(b) @@ -265,24 +251,24 @@ KFR_FN(sqrsum) KFR_FN(sqrdiff) /// Division -template <typename T1, typename T2> -inline common_type<T1, T2> div(T1 x, T2 y) +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +inline Tout div(const T1& x, const T2& y) { - return x / y; + return static_cast<Tout>(x) / static_cast<Tout>(y); } KFR_FN(div) /// Remainder -template <typename T1, typename T2> -inline common_type<T1, T2> rem(T1 x, T2 y) +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +inline Tout rem(const T1& x, const T2& y) { - return x % y; + return static_cast<Tout>(x) % static_cast<Tout>(y); } KFR_FN(rem) /// Negation template <typename T1> -inline T1 neg(T1 x) +inline T1 neg(const T1& x) { return -x; } @@ -290,7 +276,7 @@ KFR_FN(neg) /// Bitwise Not template <typename T1> -inline T1 bitwisenot(T1 x) +inline T1 bitwisenot(const T1& x) { return ~x; } @@ -453,13 +439,13 @@ namespace internal { template <typename T1, typename T2> -constexpr KFR_INLINE T1 horner(T1, T2 c0) +constexpr CMT_INLINE T1 horner(T1, T2 c0) { return c0; } template <typename T1, typename T2, typename T3, typename... 
Ts> -constexpr KFR_INLINE T1 horner(T1 x, T2 c0, T3 c1, Ts... values) +constexpr CMT_INLINE T1 horner(T1 x, T2 c0, T3 c1, Ts... values) { return fmadd(horner(x, c1, values...), x, c0); } @@ -469,7 +455,7 @@ constexpr KFR_INLINE T1 horner(T1 x, T2 c0, T3 c1, Ts... values) /// /// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\) template <typename T1, typename... Ts> -constexpr KFR_INLINE T1 horner(T1 x, Ts... c) +constexpr CMT_INLINE T1 horner(T1 x, Ts... c) { return internal::horner(x, c...); } @@ -478,7 +464,7 @@ KFR_FN(horner) /// Calculate Multiplicative Inverse of `x` /// Returns `1/x` template <typename T> -constexpr KFR_INLINE T reciprocal(T x) +constexpr CMT_INLINE T reciprocal(T x) { static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type"); return subtype<T>(1) / x; @@ -486,7 +472,7 @@ constexpr KFR_INLINE T reciprocal(T x) KFR_FN(reciprocal) template <typename T, size_t N> -KFR_INLINE vec<T, N> mulsign(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE vec<T, N> mulsign(const vec<T, N>& x, const vec<T, N>& y) { return x ^ (y & internal::highbitmask<T>); } @@ -494,85 +480,65 @@ KFR_FN_S(mulsign) KFR_FN(mulsign) template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) +constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) { return (x & internal::highbitmask<T>) | (y & internal::highbitmask<T>); } -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y) -{ - return x - cast<itype<T>>(x / y) * y; -} - -KFR_FN_S(fmod) -KFR_FN(fmod) - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -constexpr KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return x % y; -} -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return fmod(x, y); -} - template <typename T, size_t N> -KFR_INLINE mask<T, N> isnan(const vec<T, N>& x) +CMT_INLINE mask<T, N> isnan(const vec<T, N>& x) { return x != x; } template <typename T, size_t N> -KFR_INLINE mask<T, N> isinf(const vec<T, N>& x) +CMT_INLINE mask<T, N> isinf(const vec<T, N>& x) { return x == c_infinity<T> || x == -c_infinity<T>; } template <typename T, size_t N> -KFR_INLINE mask<T, N> isfinite(const vec<T, N>& x) +CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x) { return !isnan(x) && !isinf(x); } template <typename T, size_t N> -KFR_INLINE mask<T, N> isnegative(const vec<T, N>& x) +CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x) { return (x & internal::highbitmask<T>) != 0; } template <typename T, size_t N> -KFR_INLINE mask<T, N> ispositive(const vec<T, N>& x) +CMT_INLINE mask<T, N> ispositive(const vec<T, N>& x) { return !isnegative(x); } template <typename T, size_t N> -KFR_INLINE mask<T, N> iszero(const vec<T, N>& x) +CMT_INLINE mask<T, N> iszero(const vec<T, N>& x) { return x == T(); } /// Swap byte order template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> -KFR_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x) +CMT_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x) { return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); } template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)> -KFR_INLINE T swapbyteorder(T x) +CMT_INLINE T swapbyteorder(T x) { return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x))); } template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)> -KFR_INLINE T 
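// A plain-C++ sketch of the Horner evaluation documented above: coefficients
// are listed from the constant term upward and folded with fused multiply-add,
// so horner(x, 1, 2, 3) evaluates 3*x^2 + 2*x + 1.
#include <cmath>
#include <cstdio>

template <typename T>
T horner_sketch(T, T c0) { return c0; }

template <typename T, typename... Ts>
T horner_sketch(T x, T c0, T c1, Ts... rest)
{
    return std::fma(horner_sketch(x, c1, rest...), x, c0); // fmadd in the KFR version
}

int main()
{
    std::printf("%f\n", horner_sketch(2.0, 1.0, 2.0, 3.0)); // 3*4 + 2*2 + 1 = 17
}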
swapbyteorder(T x) +CMT_INLINE T swapbyteorder(T x) { return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x))); } template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)> -KFR_INLINE T swapbyteorder(T x) +CMT_INLINE T swapbyteorder(T x) { return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x))); } @@ -580,7 +546,7 @@ KFR_FN(swapbyteorder) /// Sum all elements of the vector template <typename T, size_t N> -KFR_INLINE T hadd(const vec<T, N>& value) +CMT_INLINE T hadd(const vec<T, N>& value) { return horizontal(value, fn_add()); } @@ -588,26 +554,26 @@ KFR_FN(hadd) /// Multiply all elements of the vector template <typename T, size_t N> -KFR_INLINE T hmul(const vec<T, N>& value) +CMT_INLINE T hmul(const vec<T, N>& value) { return horizontal(value, fn_mul()); } KFR_FN(hmul) template <typename T, size_t N> -KFR_INLINE T hbitwiseand(const vec<T, N>& value) +CMT_INLINE T hbitwiseand(const vec<T, N>& value) { return horizontal(value, fn_bitwiseand()); } KFR_FN(hbitwiseand) template <typename T, size_t N> -KFR_INLINE T hbitwiseor(const vec<T, N>& value) +CMT_INLINE T hbitwiseor(const vec<T, N>& value) { return horizontal(value, fn_bitwiseor()); } KFR_FN(hbitwiseor) template <typename T, size_t N> -KFR_INLINE T hbitwisexor(const vec<T, N>& value) +CMT_INLINE T hbitwisexor(const vec<T, N>& value) { return horizontal(value, fn_bitwisexor()); } @@ -615,7 +581,7 @@ KFR_FN(hbitwisexor) /// Calculate the Dot-Product of two vectors template <typename T, size_t N> -KFR_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y) { return hadd(x * y); } @@ -623,7 +589,7 @@ KFR_FN(dot) /// Calculate the Arithmetic mean of all elements in the vector template <typename T, size_t N> -KFR_INLINE T avg(const vec<T, N>& value) +CMT_INLINE T avg(const vec<T, N>& value) { return hadd(value) / N; } @@ -631,19 +597,19 @@ KFR_FN(avg) /// Calculate the RMS of all elements in the vector template <typename T, size_t N> -KFR_INLINE T rms(const vec<T, N>& value) +CMT_INLINE T rms(const vec<T, N>& value) { return internal::builtin_sqrt(hadd(value * value) / N); } KFR_FN(rms) template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) +CMT_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) { return blend<1, 0>(a + b, a - b); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) +CMT_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) { return blend<0, 1>(a + b, a - b); } @@ -651,26 +617,26 @@ KFR_FN(subadd) KFR_FN(addsub) template <typename T, size_t N> -KFR_INLINE vec<T, N> negeven(const vec<T, N>& x) +CMT_INLINE vec<T, N> negeven(const vec<T, N>& x) { return x ^ broadcast<N>(-T(), T()); } template <typename T, size_t N> -KFR_INLINE vec<T, N> negodd(const vec<T, N>& x) +CMT_INLINE vec<T, N> negodd(const vec<T, N>& x) { return x ^ broadcast<N>(T(), -T()); } #define KFR_EXPR_UNARY(fn, op) \ template <typename A1, KFR_ENABLE_IF(is_input_expression<A1>::value)> \ - KFR_INLINE auto operator op(A1&& a1)->decltype(bind_expression(fn(), std::forward<A1>(a1))) \ + CMT_INLINE auto operator op(A1&& a1)->decltype(bind_expression(fn(), std::forward<A1>(a1))) \ { \ return bind_expression(fn(), std::forward<A1>(a1)); \ } #define KFR_EXPR_BINARY(fn, op) \ template <typename A1, typename A2, KFR_ENABLE_IF(is_input_expressions<A1, A2>::value)> \ - KFR_INLINE auto operator op(A1&& a1, 
A2&& a2) \ + CMT_INLINE auto operator op(A1&& a1, A2&& a2) \ ->decltype(bind_expression(fn(), std::forward<A1>(a1), std::forward<A2>(a2))) \ { \ return bind_expression(fn(), std::forward<A1>(a1), std::forward<A2>(a2)); \ @@ -695,4 +661,54 @@ KFR_EXPR_BINARY(fn_less, <) KFR_EXPR_BINARY(fn_greater, >) KFR_EXPR_BINARY(fn_lessorequal, <=) KFR_EXPR_BINARY(fn_greaterorequal, >=) +#undef KFR_EXPR_UNARY +#undef KFR_EXPR_BINARY + +template <typename T, size_t N1, size_t... Ns> +vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest) +{ + const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...)); + return compcast<vec<T, sizeof...(Ns) + 1>>(t); +} + +KFR_FN(packtranspose) + +namespace internal +{ +template <typename... E> +struct expression_pack : expression<E...>, output_expression +{ + constexpr static size_t count = sizeof...(E); + + expression_pack(E&&... e) : expression<E...>(std::forward<E>(e)...) {} + using value_type = vec<common_type<value_type_of<E>...>, count>; + using size_type = typename expression<E...>::size_type; + constexpr size_type size() const noexcept { return expression<E...>::size(); } + + template <typename U, size_t N> + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const + { + return this->call(fn_packtranspose(), index, x); + } + template <typename U, size_t N> + CMT_INLINE void operator()(coutput_t, size_t index, const vec<vec<U, count>, N>& x) + { + output(index, x, csizeseq<count>); + } + +private: + template <typename U, size_t N, size_t... indices> + void output(size_t index, const vec<vec<U, count>, N>& x, csizes_t<indices...>) + { + const vec<vec<U, N>, count> xx = compcast<vec<U, N>>(transpose<count>(flatten(x))); + swallow{ (std::get<indices>(this->args)(coutput, index, xx[indices]), void(), 0)... }; + } +}; +} + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +internal::expression_pack<internal::arg<E>...> pack(E&&... 
e) +{ + return internal::expression_pack<internal::arg<E>...>(std::forward<E>(e)...); +} } diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp @@ -32,7 +32,7 @@ namespace kfr constexpr size_t maximum_expression_width() { return bitness_const(16, 32); } template <typename T, size_t maxwidth = maximum_expression_width()> -using expression_vtable = carray<void*, 2 + ilog2(maxwidth) + 1>; +using expression_vtable = std::array<void*, 2 + ilog2(maxwidth) + 1>; struct dummy_content { @@ -74,27 +74,27 @@ struct expression_pointer : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { using func_t = simd<T, N> (*)(void*, size_t); static_assert(is_poweroftwo(N), "N must be a power of two"); constexpr size_t findex = ilog2(N); static_assert(N <= maxwidth, "N is greater than maxwidth"); - func_t func = reinterpret_cast<func_t>(vtable->get(csize<2 + findex>)); - vec<U, N> result = cast<U>(func(instance, index)); + func_t func = reinterpret_cast<func_t>((*vtable)[2 + findex]); + vec<U, N> result = vec<T, N>(func(instance, index)); return result; } - KFR_INLINE void begin_block(size_t size) const + CMT_INLINE void begin_block(size_t size) const { using func_t = void (*)(void*, size_t); - func_t func = reinterpret_cast<func_t>(vtable->get(csize<0>)); + func_t func = reinterpret_cast<func_t>((*vtable)[0]); func(instance, size); } - KFR_INLINE void end_block(size_t size) const + CMT_INLINE void end_block(size_t size) const { using func_t = void (*)(void*, size_t); - func_t func = reinterpret_cast<func_t>(vtable->get(csize<1>)); + func_t func = reinterpret_cast<func_t>((*vtable)[1]); func(instance, size); } @@ -107,19 +107,21 @@ private: namespace internal { template <typename T, size_t N, typename Fn, typename Ret = simd<T, N>, - typename NonMemFn = Ret (*)(Fn*, size_t, vec_t<T, N>)> -KFR_INLINE NonMemFn make_expression_func() + typename NonMemFn = Ret (*)(void*, size_t)> +CMT_INLINE NonMemFn make_expression_func() { - return [](Fn* fn, size_t index, vec_t<T, N> x) { return *(fn->operator()(cinput, index, x)); }; + return [](void* fn, size_t index) -> Ret { + return *(reinterpret_cast<Fn*>(fn)->operator()(cinput, index, vec_t<T, N>())); + }; } template <typename Fn, typename NonMemFn = void (*)(Fn*, size_t)> -KFR_INLINE NonMemFn make_expression_begin_block() +CMT_INLINE NonMemFn make_expression_begin_block() { return [](Fn* fn, size_t size) { return fn->begin_block(size); }; } template <typename Fn, typename NonMemFn = void (*)(Fn*, size_t)> -KFR_INLINE NonMemFn make_expression_end_block() +CMT_INLINE NonMemFn make_expression_end_block() { return [](Fn* fn, size_t size) { return fn->end_block(size); }; } @@ -130,19 +132,19 @@ expression_vtable<T, maxwidth> make_expression_vtable_impl() expression_vtable<T, maxwidth> result; constexpr size_t size = result.size() - 2; - result.get(csize<0>) = reinterpret_cast<void*>(&internal::make_expression_begin_block<decay<E>>); - result.get(csize<1>) = reinterpret_cast<void*>(&internal::make_expression_end_block<decay<E>>); + result[0] = reinterpret_cast<void*>(&internal::make_expression_begin_block<decay<E>>); + result[1] = reinterpret_cast<void*>(&internal::make_expression_end_block<decay<E>>); cforeach(csizeseq<size>, [&](auto u) { - constexpr size_t N = 1 << val_of(u); - result.get(csize<2 + val_of(u)>) = + constexpr size_t N = 1 << val_of(decltype(u)()); + result[2 + val_of(decltype(u)())] = 
reinterpret_cast<void*>(internal::make_expression_func<T, N, decay<E>>()); }); return result; } template <typename T, size_t maxwidth, typename E> -KFR_INLINE expression_vtable<T, maxwidth>* make_expression_vtable() +CMT_INLINE expression_vtable<T, maxwidth>* make_expression_vtable() { static_assert(is_input_expression<E>::value, "E must be an expression"); static expression_vtable<T, maxwidth> vtable = internal::make_expression_vtable_impl<T, maxwidth, E>(); @@ -151,7 +153,7 @@ KFR_INLINE expression_vtable<T, maxwidth>* make_expression_vtable() } template <typename E, typename T = value_type_of<E>, size_t maxwidth = maximum_expression_width()> -KFR_INLINE expression_pointer<T, maxwidth> to_pointer(E& expr) +CMT_INLINE expression_pointer<T, maxwidth> to_pointer(E& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); return expression_pointer<T, maxwidth>(std::addressof(expr), @@ -159,7 +161,7 @@ KFR_INLINE expression_pointer<T, maxwidth> to_pointer(E& expr) } template <typename E, typename T = value_type_of<E>, size_t maxwidth = maximum_expression_width()> -KFR_INLINE expression_pointer<T, maxwidth> to_pointer(E&& expr) +CMT_INLINE expression_pointer<T, maxwidth> to_pointer(E&& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr)); diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp @@ -114,8 +114,8 @@ inline enable_if_not_f<vec<T, N>> random_range(random_bit_generator& gen, T min, using big_type = findinttype<sqr(std::numeric_limits<T>::min()), sqr(std::numeric_limits<T>::max())>; vec<T, N> u = random_uniform<T, N>(gen); - const vec<big_type, N> tmp = cast<big_type>(u); - return cast<T>((tmp * (max - min) + min) >> typebits<T>::bits); + const vec<big_type, N> tmp = u; + return (tmp * (max - min) + min) >> typebits<T>::bits; } namespace internal @@ -128,7 +128,7 @@ struct expression_random_uniform : input_expression template <typename U, size_t N> vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const { - return cast<U>(random_uniform<T, N>(gen)); + return random_uniform<T, N>(gen); } mutable random_bit_generator gen; }; @@ -146,7 +146,7 @@ struct expression_random_range : input_expression template <typename U, size_t N> vec<U, N> operator()(cinput_t, size_t, vec_t<U, N>) const { - return cast<U>(random_range<N, T>(gen, min, max)); + return random_range<N, T>(gen, min, max); } mutable random_bit_generator gen; const T min; diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -30,31 +30,31 @@ namespace kfr { template <size_t N, bool A = false, typename T> -KFR_INLINE vec<T, N> read(const T* src) +CMT_INLINE vec<T, N> read(const T* src) { return internal_read_write::read<N, A, T>(src); } template <bool A = false, size_t N, typename T> -KFR_INLINE void write(T* dest, const vec<T, N>& value) +CMT_INLINE void write(T* dest, const vec<T, N>& value) { internal_read_write::write<A, N, T>(dest, value); } template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -KFR_INLINE vec<T, Nout> gather(const T* base, size_t index, Indices... indices) +CMT_INLINE vec<T, Nout> gather(const T* base, size_t index, Indices... indices) { return make_vector(base[index], base[indices]...); } template <size_t Index, size_t... 
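// A miniature sketch (plain C++, not KFR code) of the type erasure behind
// expression_pointer/to_pointer above: the expression is reduced to an opaque
// instance pointer plus a flat table of function pointers (KFR keeps one entry
// per power-of-two block width plus begin_block/end_block slots, stored as
// void* and cast back on call). The names below are illustrative only.
#include <array>
#include <cstdio>

struct erased_expression
{
    using eval_fn = float (*)(void*, std::size_t);
    void* instance;
    std::array<eval_fn, 1> table; // single width here; KFR indexes by log2(width)

    float operator()(std::size_t index) const { return table[0](instance, index); }
};

template <typename Expr>
erased_expression erase(Expr& e)
{
    return { &e, { +[](void* p, std::size_t i) { return (*static_cast<Expr*>(p))(i); } } };
}

int main()
{
    auto ramp = [](std::size_t i) { return 0.5f * i; };
    const erased_expression p = erase(ramp);
    std::printf("%f\n", p(4)); // 2.000000
}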
Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -KFR_INLINE vec<T, Nout> gather(const T* base) +CMT_INLINE vec<T, Nout> gather(const T* base) { return make_vector(base[Index], base[Indices]...); } template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> -KFR_INLINE void scatter(const T* base, const vec<T, N>& value) +CMT_INLINE void scatter(const T* base, const vec<T, N>& value) { base[Index] = value[InIndex]; scatter<Indices..., T, N, InIndex + 1>(base, value); @@ -63,60 +63,60 @@ KFR_INLINE void scatter(const T* base, const vec<T, N>& value) namespace internal { template <typename T, size_t N, size_t... Indices> -KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) +CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) { return make_vector(base[indices[Indices]]...); } template <size_t Nout, size_t Stride, typename T, size_t... Indices> -KFR_INLINE vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) +CMT_INLINE vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) { return make_vector(base[Indices * Stride]...); } template <size_t Nout, typename T, size_t... Indices> -KFR_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) +CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) { return make_vector(base[Indices * stride]...); } } template <typename T, size_t N> -KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) +CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) { return internal::gather(base, indices, csizeseq<N>); } template <size_t Nout, typename T> -KFR_INLINE vec<T, Nout> gather_stride(const T* base, size_t stride) +CMT_INLINE vec<T, Nout> gather_stride(const T* base, size_t stride) { return internal::gather_stride_s<Nout>(base, stride, csizeseq<Nout>); } template <size_t Nout, size_t Stride, typename T> -KFR_INLINE vec<T, Nout> gather_stride(const T* base) +CMT_INLINE vec<T, Nout> gather_stride(const T* base) { return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>); } template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> -KFR_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>) +CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>) { return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) +CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) { return gather_helper<groupsize>(base, offset, csizeseq<N>); } template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> -KFR_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, +CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, csizes_t<Indices...>) { swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), 0)... 
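// A plain-C++ sketch (not KFR code) of gather_stride<Nout, Stride>(base)
// above: read Nout elements spaced Stride apart, e.g. one channel of
// interleaved audio.
#include <array>
#include <cstdio>

template <std::size_t Nout, std::size_t Stride, typename T>
std::array<T, Nout> gather_stride_sketch(const T* base)
{
    std::array<T, Nout> r{};
    for (std::size_t i = 0; i < Nout; ++i)
        r[i] = base[i * Stride];
    return r;
}

int main()
{
    const float interleaved[8] = { 0, 10, 1, 11, 2, 12, 3, 13 }; // L R L R ...
    for (float v : gather_stride_sketch<4, 2>(interleaved))
        std::printf("%g ", v); // 0 1 2 3  (left channel)
    std::printf("\n");
}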
}; } template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -KFR_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) +CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) { return scatter_helper<groupsize>(base, offset, value, csizeseq<N>); } @@ -188,14 +188,14 @@ constexpr T partial_masks[] = { internal::allones<T>, T() }; template <typename T, size_t N> -KFR_INLINE vec<T, N> partial_mask(size_t index) +CMT_INLINE vec<T, N> partial_mask(size_t index) { static_assert(N <= arraysize(partial_masks<T>) / 2, "N must not be greater than half of partial_masks expression_array"); return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index); } template <typename T, size_t N> -KFR_INLINE vec<T, N> partial_mask(size_t index, vec_t<T, N>) +CMT_INLINE vec<T, N> partial_mask(size_t index, vec_t<T, N>) { return partial_mask<T, N>(index); } diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp @@ -32,14 +32,14 @@ namespace kfr { template <typename T> -KFR_INLINE T final_mean(T value, size_t size) +CMT_INLINE T final_mean(T value, size_t size) { return value / T(size); } KFR_FN(final_mean) template <typename T> -KFR_INLINE T final_rootmean(T value, size_t size) +CMT_INLINE T final_rootmean(T value, size_t size) { return internal::builtin_sqrt(value / T(size)); } @@ -48,12 +48,12 @@ KFR_FN(final_rootmean) namespace internal { template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, T, size_t>::value)> -KFR_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t size, T value) +CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t size, T value) { return finalfn(value, size); } template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, T, size_t>::value)> -KFR_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) +CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) { return finalfn(value); } @@ -70,26 +70,26 @@ struct expression_reduce : output_expression } template <typename U, size_t N> - KFR_INLINE void operator()(coutput_t, size_t, const vec<U, N>& x) const + CMT_INLINE void operator()(coutput_t, size_t, const vec<U, N>& x) const { counter += N; process(x); } - KFR_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); } + CMT_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); } protected: void reset() { counter = 0; } - KFR_INLINE void process(vec<T, width> x) const { value = reducefn(transformfn(x), value); } + CMT_INLINE void process(vec<T, width> x) const { value = reducefn(transformfn(x), value); } template <size_t N, KFR_ENABLE_IF(N < width)> - KFR_INLINE void process(vec<T, N> x) const + CMT_INLINE void process(vec<T, N> x) const { value = combine(value, reducefn(transformfn(x), narrow<N>(value))); } template <size_t N, KFR_ENABLE_IF(N > width)> - KFR_INLINE void process(vec<T, N> x) const + CMT_INLINE void process(vec<T, N> x) const { process(low(x)); process(high(x)); diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp @@ -51,7 +51,7 @@ namespace intrinsics #define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TRUNC) #define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_NINT) -#if defined CID_ARCH_SSE41 +#if defined CMT_ARCH_SSE41 KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); } KFR_SINTRIN f32sse ceil(const f32sse& value) { 
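// A scalar sketch (not KFR code) of what expression_reduce with
// final_mean/final_rootmean above computes: blocks are transformed, reduced
// into an accumulator, and a final step divides by the element count (mean)
// or additionally takes the square root (RMS).
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<double> x{ 1, 2, 3, 4 };
    double sum = 0, sumsqr = 0;
    for (double v : x)
    {
        sum += v;        // reducefn = add, transformfn = identity
        sumsqr += v * v; // reducefn = add, transformfn = sqr
    }
    std::printf("mean = %f\n", sum / x.size());                // 2.5
    std::printf("rms  = %f\n", std::sqrt(sumsqr / x.size()));  // ~2.7386
}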
return _mm_ceil_ps(*value); } @@ -64,7 +64,7 @@ KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*v KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); } KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); } -#if defined CID_ARCH_AVX +#if defined CMT_ARCH_AVX KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); } KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); } @@ -318,6 +318,26 @@ KFR_INTRIN internal::expression_function<fn::itrunc, E1> itrunc(E1&& x) { return { fn::itrunc(), std::forward<E1>(x) }; } + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +CMT_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y) +{ + return x - trunc(x / y) * y; +} + +KFR_FN_S(fmod) +KFR_FN(fmod) + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +constexpr CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return x % y; +} +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return fmod(x, y); +} } #undef KFR_mm_trunc_ps diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp @@ -30,6 +30,8 @@ namespace kfr namespace intrinsics { + +// Generic functions template <typename T, size_t N> KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) { @@ -66,7 +68,7 @@ KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N> return select(a < b, zerovector(a), a - b); } -#if defined CID_ARCH_SSE2 +#if defined CMT_ARCH_SSE2 KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); } KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); } @@ -88,7 +90,7 @@ KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_s KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } -#if defined CID_ARCH_AVX2 +#if defined CMT_ARCH_AVX2 KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); } KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); } KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); } @@ -103,6 +105,31 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs KFR_HANDLE_ALL_SIZES_2(satadd) KFR_HANDLE_ALL_SIZES_2(satsub) +#elif defined CMT_ARCH_NEON + +KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); } +KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); } +KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); } +KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); } + +KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); } +KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); } +KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); } +KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); } + +KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); } +KFR_SINTRIN i32neon 
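// A scalar sketch of the fmod() added to round.hpp above: the classic
// "x - trunc(x / y) * y" identity, which keeps the sign of x.
#include <cmath>
#include <cstdio>

static double fmod_sketch(double x, double y) { return x - std::trunc(x / y) * y; }

int main()
{
    std::printf("%f %f\n", fmod_sketch(7.5, 2.0), std::fmod(7.5, 2.0));   // 1.5 1.5
    std::printf("%f %f\n", fmod_sketch(-7.5, 2.0), std::fmod(-7.5, 2.0)); // -1.5 -1.5
}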
satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); } +KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); } +KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); } + +KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_u32(*a, *b); } +KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s32(*a, *b); } +KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u64(*a, *b); } +KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_s64(*a, *b); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + #else // fallback template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -29,7 +29,7 @@ namespace kfr namespace intrinsics { -#if defined CID_ARCH_SSE41 +#if defined CMT_ARCH_SSE41 KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y) { @@ -72,7 +72,7 @@ KFR_SINTRIN f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y) return _mm_blendv_pd(*y, *x, *m); } -#if defined CID_ARCH_AVX +#if defined CMT_ARCH_AVX KFR_SINTRIN f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y) { return _mm256_blendv_pd(*y, *x, *m); @@ -83,7 +83,7 @@ KFR_SINTRIN f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y) } #endif -#if defined CID_ARCH_AVX2 +#if defined CMT_ARCH_AVX2 KFR_SINTRIN u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y) { return _mm256_blendv_epi8(*y, *x, *m); @@ -129,13 +129,70 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec< return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); } +#elif defined CMT_ARCH_NEON + +KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y) +{ + return vbslq_f32(*m, *x, *y); +} + +KFR_SINTRIN i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) { return vbslq_s8(*m, *x, *y); } +KFR_SINTRIN u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) { return vbslq_u8(*m, *x, *y); } +KFR_SINTRIN i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y) +{ + return vbslq_s16(*m, *x, *y); +} +KFR_SINTRIN u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y) +{ + return vbslq_u16(*m, *x, *y); +} +KFR_SINTRIN i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y) +{ + return vbslq_s32(*m, *x, *y); +} +KFR_SINTRIN u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y) +{ + return vbslq_u32(*m, *x, *y); +} +KFR_SINTRIN i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y) +{ + return vbslq_s64(*m, *x, *y); +} +KFR_SINTRIN u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y) +{ + return vbslq_u64(*m, *x, *y); +} + +#ifdef CMT_ARCH_NEON64 +KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) +{ + return vbslq_f64(*m, *x, *y); +} +#else +KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) +{ + return y ^ ((x ^ y) & f64neon(*m)); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N 
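// In the NEON satsub overloads of the saturation.hpp hunk above, the
// signed/unsigned and 32/64-bit intrinsics appear to be paired inconsistently
// (i32 calls vqsubq_u32, u32 calls vqsubq_u64, and so on). A consistent
// pairing, assuming only the standard <arm_neon.h> intrinsics, would be:
#include <arm_neon.h>

static int32x4_t  satsub_i32(int32x4_t a, int32x4_t b)    { return vqsubq_s32(a, b); }
static uint32x4_t satsub_u32(uint32x4_t a, uint32x4_t b)  { return vqsubq_u32(a, b); }
static int64x2_t  satsub_i64(int64x2_t a, int64x2_t b)    { return vqsubq_s64(a, b); }
static uint64x2_t satsub_u64(uint64x2_t a, uint64x2_t b)  { return vqsubq_u64(a, b); }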
>= vector_width<T, cpu_t::native>), typename = void> +KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); +} + #else // fallback template <typename T, size_t N> -KFR_SINTRIN vec<T, N> select(mask<T, N> m, const vec<T, N>& x, const vec<T, N>& y) +KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) { - return y ^ ((x ^ y) & m); + return y ^ ((x ^ y) & vec<T, N>(*m)); } #endif } @@ -146,8 +203,7 @@ template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_nume KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) { static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); - return intrinsics::select(bitcast<Tout>(m).asmask(), static_cast<vec<Tout, N>>(x), - static_cast<vec<Tout, N>>(y)); + return intrinsics::select(bitcast<Tout>(m), static_cast<vec<Tout, N>>(x), static_cast<vec<Tout, N>>(y)); } template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp @@ -35,20 +35,20 @@ namespace internal { template <size_t index, typename T> -constexpr KFR_INLINE T broadcast_get_nth() +constexpr CMT_INLINE T broadcast_get_nth() { return c_qnan<T>; } template <size_t index, typename T, typename... Ts> -constexpr KFR_INLINE T broadcast_get_nth(T x, Ts... rest) +constexpr CMT_INLINE T broadcast_get_nth(T x, Ts... rest) { return index == 0 ? x : broadcast_get_nth<index - 1, T>(rest...); } template <typename T, typename... Ts, size_t... indices, size_t Nin = 1 + sizeof...(Ts), size_t Nout = sizeof...(indices)> -KFR_INLINE constexpr vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest) +CMT_INLINE constexpr vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest) { simd<T, Nout> result{ broadcast_get_nth<indices % Nin>(x, rest...)... }; return result; @@ -56,46 +56,46 @@ KFR_INLINE constexpr vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts } template <size_t Nout, typename T, typename... Ts> -constexpr KFR_INLINE vec<T, Nout> broadcast(T x, T y, Ts... rest) +constexpr CMT_INLINE vec<T, Nout> broadcast(T x, T y, Ts... 
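// A scalar sketch (not KFR code) of the bitwise fallback select() above:
// where the mask bits are 1 the result takes bits from x, elsewhere from y,
// i.e. y ^ ((x ^ y) & m).
#include <cstdint>
#include <cstdio>

static std::uint32_t select_bits(std::uint32_t m, std::uint32_t x, std::uint32_t y)
{
    return y ^ ((x ^ y) & m);
}

int main()
{
    std::printf("%08x\n", (unsigned)select_bits(0xFFFFFFFFu, 0xAAAAAAAAu, 0x55555555u)); // aaaaaaaa
    std::printf("%08x\n", (unsigned)select_bits(0x00000000u, 0xAAAAAAAAu, 0x55555555u)); // 55555555
}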
rest) { return internal::broadcast_helper(csizeseq<Nout>, x, y, rest...); } KFR_FN(broadcast) template <size_t Ncount, typename T, size_t N> -KFR_INLINE vec<T, N + Ncount> padhigh(const vec<T, N>& x) +CMT_INLINE vec<T, N + Ncount> padhigh(const vec<T, N>& x) { return shufflevector<N + Ncount, internal::shuffle_index_extend<0, N>>(x); } KFR_FN(padhigh) template <size_t Ncount, typename T, size_t N> -KFR_INLINE vec<T, N + Ncount> padlow(const vec<T, N>& x) +CMT_INLINE vec<T, N + Ncount> padlow(const vec<T, N>& x) { return shufflevector<N + Ncount, internal::shuffle_index_extend<Ncount, N>>(x); } KFR_FN(padlow) template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)> -KFR_INLINE vec<T, Nout> extend(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index_extend<0, N>>(x); } template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)> -constexpr KFR_INLINE vec<T, Nout> extend(const vec<T, N>& x) +constexpr CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) { return x; } KFR_FN(extend) template <size_t start, size_t count, typename T, size_t N> -KFR_INLINE vec<T, count> slice(const vec<T, N>& x) +CMT_INLINE vec<T, count> slice(const vec<T, N>& x) { static_assert(start + count <= N, "start + count <= N"); return shufflevector<count, internal::shuffle_index<start>>(x); } template <size_t start, size_t count, typename T, size_t N> -KFR_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) { static_assert(start + count <= N * 2, "start + count <= N * 2"); return shufflevector<count, internal::shuffle_index<start>>(x, y); @@ -103,11 +103,11 @@ KFR_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) KFR_FN(slice) template <size_t, typename T, size_t N> -KFR_INLINE void split(const vec<T, N>&) +CMT_INLINE void split(const vec<T, N>&) { } template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args> -KFR_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... args) +CMT_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... args) { out = slice<start, Nout>(x); split<start + Nout>(x, std::forward<Args>(args)...); @@ -115,7 +115,7 @@ KFR_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... 
args) KFR_FN(split) template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total> -KFR_INLINE vec<T, Nout> part(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> part(const vec<T, N>& x) { static_assert(N % total == 0, "N % total == 0"); return shufflevector<Nout, internal::shuffle_index<number * Nout>>(x); @@ -123,27 +123,27 @@ KFR_INLINE vec<T, Nout> part(const vec<T, N>& x) KFR_FN(part) template <size_t start, size_t count, typename T, size_t N1, size_t N2> -KFR_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) { return internal::concattwo<start, count>(x, y); } KFR_FN(concat_and_slice) template <size_t Nout, typename T, size_t N> -KFR_INLINE vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) +CMT_INLINE vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) { static_assert(Nout > N, "Nout > N"); return concat(x, broadcast<Nout - N>(newvalue)); } template <size_t Nout, typename T, typename TS> -constexpr KFR_INLINE vec<T, Nout> widen(const vec<T, Nout>& x, TS) +constexpr CMT_INLINE vec<T, Nout> widen(const vec<T, Nout>& x, TS) { return x; } KFR_FN(widen) template <size_t Nout, typename T, size_t N> -KFR_INLINE vec<T, Nout> narrow(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> narrow(const vec<T, N>& x) { static_assert(Nout <= N, "Nout <= N"); return slice<0, Nout>(x); @@ -152,7 +152,7 @@ KFR_FN(narrow) template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -KFR_INLINE vec<T, Nout> even(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> even(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index<0, 2>, groupsize>(x); } @@ -160,7 +160,7 @@ KFR_FNR(even, 2, 1) template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -KFR_INLINE vec<T, Nout> odd(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> odd(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index<1, 2>, groupsize>(x); } @@ -182,7 +182,7 @@ struct shuffle_index_dup } template <typename T, size_t N> -KFR_INLINE vec<T, N> dupeven(const vec<T, N>& x) +CMT_INLINE vec<T, N> dupeven(const vec<T, N>& x) { static_assert(N % 2 == 0, "N must be even"); return shufflevector<N, internal::shuffle_index_dup<2, 0>>(x); @@ -190,7 +190,7 @@ KFR_INLINE vec<T, N> dupeven(const vec<T, N>& x) KFR_FN(dupeven) template <typename T, size_t N> -KFR_INLINE vec<T, N> dupodd(const vec<T, N>& x) +CMT_INLINE vec<T, N> dupodd(const vec<T, N>& x) { static_assert(N % 2 == 0, "N must be even"); return shufflevector<N, internal::shuffle_index_dup<2, 1>>(x); @@ -198,7 +198,7 @@ KFR_INLINE vec<T, N> dupodd(const vec<T, N>& x) KFR_FN(dupodd) template <typename T, size_t N> -KFR_INLINE vec<T, N * 2> duphalfs(const vec<T, N>& x) +CMT_INLINE vec<T, N * 2> duphalfs(const vec<T, N>& x) { return concat(x, x); } @@ -221,7 +221,7 @@ struct shuffle_index_shuffle } template <size_t... Indices, typename T, size_t N> -KFR_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, +CMT_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, elements_t<Indices...> = elements_t<Indices...>()) { return shufflevector<N, internal::shuffle_index_shuffle<N, Indices...>>(x, y); @@ -229,7 +229,7 @@ KFR_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, KFR_FN(shuffle) template <size_t groupsize, size_t... 
Indices, typename T, size_t N> -KFR_INLINE vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, +CMT_INLINE vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, elements_t<Indices...> = elements_t<Indices...>()) { return shufflevector<N, internal::shuffle_index_shuffle<N, Indices...>, groupsize>(x, y); @@ -254,14 +254,14 @@ struct shuffle_index_permute } template <size_t... Indices, typename T, size_t N> -KFR_INLINE vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> = elements_t<Indices...>()) +CMT_INLINE vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> = elements_t<Indices...>()) { return shufflevector<N, internal::shuffle_index_permute<N, Indices...>>(x); } KFR_FN(permute) template <size_t groupsize, size_t... Indices, typename T, size_t N> -KFR_INLINE vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> = elements_t<Indices...>()) +CMT_INLINE vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> = elements_t<Indices...>()) { return shufflevector<N, internal::shuffle_index_permute<N, Indices...>, groupsize>(x); } @@ -271,7 +271,7 @@ namespace internal { template <typename T, size_t Nout, typename Fn, size_t... Indices> -constexpr KFR_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>) +constexpr CMT_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>) { constexpr Fn fn{}; return make_vector(static_cast<T>(fn(Indices))...); @@ -279,7 +279,7 @@ constexpr KFR_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>) } template <typename T, size_t Nout, typename Fn> -constexpr KFR_INLINE vec<T, Nout> generate_vector() +constexpr CMT_INLINE vec<T, Nout> generate_vector() { return internal::generate_vector<T, Nout, Fn>(csizeseq<Nout>); } @@ -288,19 +288,19 @@ KFR_FN(generate_vector) namespace internal { template <typename T, size_t N> -constexpr KFR_INLINE mask<T, N> evenmask() +constexpr CMT_INLINE mask<T, N> evenmask() { return broadcast<N, T>(maskbits<T>(true), maskbits<T>(false)); } template <typename T, size_t N> -constexpr KFR_INLINE mask<T, N> oddmask() +constexpr CMT_INLINE mask<T, N> oddmask() { return broadcast<N, T>(maskbits<T>(false), maskbits<T>(true)); } } template <typename T, size_t N, size_t Nout = N * 2> -KFR_INLINE vec<T, Nout> dup(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> dup(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index_dup1<2>>(x, x); } @@ -316,7 +316,7 @@ struct shuffle_index_duphalf } template <typename T, size_t N> -KFR_INLINE vec<T, N> duplow(const vec<T, N>& x) +CMT_INLINE vec<T, N> duplow(const vec<T, N>& x) { static_assert(N % 2 == 0, "N must be even"); return shufflevector<N, internal::shuffle_index_duphalf<N / 2, 0>>(x); @@ -324,7 +324,7 @@ KFR_INLINE vec<T, N> duplow(const vec<T, N>& x) KFR_FN(duplow) template <typename T, size_t N> -KFR_INLINE vec<T, N> duphigh(vec<T, N> x) +CMT_INLINE vec<T, N> duphigh(vec<T, N> x) { static_assert(N % 2 == 0, "N must be even"); return shufflevector<N, internal::shuffle_index_duphalf<N / 2, N / 2>>(x); @@ -347,7 +347,7 @@ struct shuffle_index_blend } template <size_t... 
Indices, typename T, size_t N> -KFR_INLINE vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, +CMT_INLINE vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, elements_t<Indices...> = elements_t<Indices...>()) { return shufflevector<N, internal::shuffle_index_blend<N, Indices...>, 1>(x, y); @@ -376,20 +376,20 @@ struct shuffle_index_outputright } template <size_t elements, typename T, size_t N> -KFR_INLINE vec<T, N> swap(vec<T, N> x) +CMT_INLINE vec<T, N> swap(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_swap<elements>>(x); } -KFR_FN(swap) +CMT_FN_TPL((size_t elements), (elements), swap) template <size_t shift, typename T, size_t N> -KFR_INLINE vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) +CMT_INLINE vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) { return shift == 0 ? lo : (shift == N ? hi : shufflevector<N, internal::shuffle_index<N - shift>>(hi, lo)); } template <size_t amount, typename T, size_t N> -KFR_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +CMT_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) { static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); return shufflevector<N, internal::shuffle_index_wrap<N, N - amount>>(x); @@ -397,7 +397,7 @@ KFR_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<a KFR_FN(rotateright) template <size_t amount, typename T, size_t N> -KFR_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +CMT_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) { static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); return shufflevector<N, internal::shuffle_index_wrap<N, amount>>(x); @@ -405,21 +405,21 @@ KFR_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<am KFR_FN(rotateleft) template <typename T, size_t N> -KFR_INLINE vec<T, N> insertright(T x, const vec<T, N>& y) +CMT_INLINE vec<T, N> insertright(T x, const vec<T, N>& y) { return concat_and_slice<1, N>(y, vec<T, 1>(x)); } KFR_FN(insertright) template <typename T, size_t N> -KFR_INLINE vec<T, N> insertleft(T x, const vec<T, N>& y) +CMT_INLINE vec<T, N> insertleft(T x, const vec<T, N>& y) { return concat_and_slice<0, N>(vec<T, 1>(x), y); } KFR_FN(insertleft) template <typename T, size_t N, size_t N2> -KFR_INLINE vec<T, N> outputright(const vec<T, N>& x, const vec<T, N2>& y) +CMT_INLINE vec<T, N> outputright(const vec<T, N>& x, const vec<T, N2>& y) { return shufflevector<N, internal::shuffle_index_outputright<N2, N>>(x, extend<N>(y)); } @@ -439,46 +439,51 @@ struct shuffle_index_transpose } template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize > 3)> -KFR_INLINE vec<T, N> transpose(const vec<T, N>& x) +CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_transpose<N / groupsize, side>, groupsize>(x); } template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> -KFR_INLINE vec<T, N> transpose(const vec<T, N>& x) +CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) { return x; } +template <typename T, size_t N> +CMT_INLINE vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x) +{ + return *transpose<2>(flatten(x)); +} KFR_FN(transpose) template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize > 3)> -KFR_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) 
+CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_transpose<N / groupsize, N / groupsize / side>, groupsize>(x); } template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> -KFR_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) +CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) { return x; } KFR_FN(transposeinverse) template <size_t side, typename T, size_t N> -KFR_INLINE vec<T, N> ctranspose(const vec<T, N>& x) +CMT_INLINE vec<T, N> ctranspose(const vec<T, N>& x) { return transpose<side, 2>(x); } KFR_FN(ctranspose) template <size_t side, typename T, size_t N> -KFR_INLINE vec<T, N> ctransposeinverse(const vec<T, N>& x) +CMT_INLINE vec<T, N> ctransposeinverse(const vec<T, N>& x) { return transposeinverse<side, 2>(x); } KFR_FN(ctransposeinverse) template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N * 2> -KFR_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) { return shufflevector<Nout, internal::shuffle_index_transpose<Nout / groupsize, Nout / groupsize / 2>, groupsize>(x, y); @@ -486,13 +491,13 @@ KFR_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) KFR_FNR(interleave, 1, 2) template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INLINE internal::expression_function<fn_interleave, E1, E2> interleave(E1&& x, E2&& y) +CMT_INLINE internal::expression_function<fn_interleave, E1, E2> interleave(E1&& x, E2&& y) { return { fn_interleave(), std::forward<E1>(x), std::forward<E2>(y) }; } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x) +CMT_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_transpose<N / groupsize, N / groupsize / 2>, groupsize>( x); @@ -500,7 +505,7 @@ KFR_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x) KFR_FN(interleavehalfs) template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> splitpairs(const vec<T, N>& x) +CMT_INLINE vec<T, N> splitpairs(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_transpose<N / groupsize, 2>, groupsize>(x); } @@ -516,10 +521,15 @@ struct shuffle_index_reverse } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> reverse(const vec<T, N>& x) +CMT_INLINE vec<T, N> reverse(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_reverse<N / groupsize>, groupsize>(x); } +template <typename T, size_t N1, size_t N2> +CMT_INLINE vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x) +{ + return *swap<N1>(flatten(x)); +} KFR_FN(reverse) namespace internal @@ -532,7 +542,7 @@ struct shuffle_index_combine } template <typename T, size_t N1, size_t N2> -KFR_INLINE vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) { static_assert(N2 <= N1, "N2 <= N1"); return shufflevector<N1, internal::shuffle_index_combine<N1, N2>>(x, extend<N1>(y)); @@ -557,28 +567,27 @@ struct generate_onoff } template <typename T, size_t N, size_t start = 0, size_t stride = 1> -constexpr KFR_INLINE vec<T, N> enumerate() +constexpr CMT_INLINE vec<T, N> enumerate() { return generate_vector<T, N, internal::generate_index<start, stride>>(); } template <size_t start = 0, size_t stride = 1, typename T, size_t N> -constexpr KFR_INLINE vec<T, 
N> enumerate(vec_t<T, N>) +constexpr CMT_INLINE vec<T, N> enumerate(vec_t<T, N>) { return generate_vector<T, N, internal::generate_index<start, stride>>(); } KFR_FN(enumerate) template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0> -constexpr KFR_INLINE vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) +constexpr CMT_INLINE vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) { return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); } template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N> -constexpr KFR_INLINE vec<T, N> onoff(vec_t<T, N>, cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) +constexpr CMT_INLINE vec<T, N> onoff(vec_t<T, N>, cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) { return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); } KFR_FN(onoff) } -#define KFR_SHUFFLE_SPECIALIZATIONS #include "specializations.i" diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp @@ -31,7 +31,7 @@ #include "select.hpp" #include "shuffle.hpp" -#if CID_HAS_WARNING("-Wc99-extensions") +#if CMT_HAS_WARNING("-Wc99-extensions") #pragma clang diagnostic ignored "-Wc99-extensions" #endif diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -37,7 +37,7 @@ namespace kfr * @endcode */ template <typename T, size_t N> -KFR_INLINE vec<T, N> sort(const vec<T, N>& x) +CMT_INLINE vec<T, N> sort(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); @@ -70,7 +70,7 @@ KFR_INLINE vec<T, N> sort(const vec<T, N>& x) * @endcode */ template <typename T, size_t N> -KFR_INLINE vec<T, N> sortdesc(const vec<T, N>& x) +CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp @@ -30,7 +30,7 @@ namespace kfr namespace intrinsics { -#if defined CID_ARCH_SSE2 +#if defined CMT_ARCH_SSE2 KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); } KFR_SINTRIN f64x1 sqrt(const f64x1& x) @@ -40,7 +40,7 @@ KFR_SINTRIN f64x1 sqrt(const f64x1& x) KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); } KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); } -#if defined CID_ARCH_AVX +#if defined CMT_ARCH_AVX KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); } KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); } #endif diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -41,7 +41,7 @@ struct fn_##fn \ { \ template <typename... Args> \ - CID_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ + CMT_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ { \ return fn(std::forward<Args>(args)...); \ } \ @@ -53,7 +53,7 @@ struct FN \ { \ template <typename... Args> \ - CID_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ + CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ Args&&... args) const \ { \ return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \ @@ -66,7 +66,7 @@ { \ using ratio = ioratio<in, out>; \ template <typename... Args> \ - CID_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ + CMT_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... 
args) const \ { \ return fn(std::forward<Args>(args)...); \ } \ @@ -77,7 +77,7 @@ { \ constexpr fn_##fn() noexcept = default; \ template <typename... Args> \ - KFR_INLINE decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ + CMT_INLINE decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ { \ return fn(std::forward<Args>(args)...); \ } \ @@ -102,7 +102,7 @@ using imax = int64_t; using fmax = double; using f80 = long double; -#ifdef KFR_BASETYPE_F32 +#if defined(KFR_BASETYPE_F32) || defined(KFR_NO_NATIVE_F64) using fbase = f32; #else using fbase = f64; @@ -200,7 +200,7 @@ inline datatype operator&(datatype x, datatype y) struct generic { template <typename T> - KFR_INLINE constexpr operator T() const noexcept + CMT_INLINE constexpr operator T() const noexcept { return T(); } @@ -209,7 +209,7 @@ struct generic struct infinite { template <typename T> - KFR_INLINE constexpr operator T() const noexcept + CMT_INLINE constexpr operator T() const noexcept { return T(); } @@ -234,9 +234,9 @@ enum class archendianness : int _archendianness_max = static_cast<int>(bigendian) }; -typedef void*(KFR_CDECL* func_allocate)(size_t); +typedef void*(CMT_CDECL* func_allocate)(size_t); -typedef void(KFR_CDECL* func_deallocate)(void*); +typedef void(CMT_CDECL* func_deallocate)(void*); struct mem_allocator { @@ -328,7 +328,7 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y) enum class cpu_t : int { common = 0, -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 sse2 = 1, sse3 = 2, ssse3 = 3, @@ -340,15 +340,18 @@ enum class cpu_t : int lowest = static_cast<int>(sse2), highest = static_cast<int>(avx2), #endif -#ifdef CID_ARCH_ARM +#ifdef CMT_ARCH_ARM neon = 1, + neon64 = 2, lowest = static_cast<int>(neon), - highest = static_cast<int>(neon), + highest = static_cast<int>(neon64), #endif - native = static_cast<int>(KFR_ARCH_NAME), + native = static_cast<int>(CMT_ARCH_NAME), runtime = -1, }; +#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native + template <cpu_t cpu> using ccpu_t = cval_t<cpu_t, cpu>; @@ -360,7 +363,7 @@ namespace internal constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 constexpr auto cpu_list = cvals<cpu_t, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, cpu_t::sse3, cpu_t::sse2>; #else @@ -516,23 +519,23 @@ using enable_if_not_f = enable_if<typeclass<T> != datatype::f, R>; namespace internal { -KFR_INLINE f32 builtin_sqrt(f32 x) { return __builtin_sqrtf(x); } -KFR_INLINE f64 builtin_sqrt(f64 x) { return __builtin_sqrt(x); } -KFR_INLINE f80 builtin_sqrt(f80 x) { return __builtin_sqrtl(x); } -KFR_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) +CMT_INLINE f32 builtin_sqrt(f32 x) { return __builtin_sqrtf(x); } +CMT_INLINE f64 builtin_sqrt(f64 x) { return __builtin_sqrt(x); } +CMT_INLINE f80 builtin_sqrt(f80 x) { return __builtin_sqrtl(x); } +CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { __builtin_memcpy(dest, src, size); } -KFR_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); } +CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); } template <typename T1> -KFR_INLINE void zeroize(T1& value) +CMT_INLINE void zeroize(T1& value) { builtin_memset(static_cast<void*>(std::addressof(value)), 0, sizeof(T1)); } } #pragma clang diagnostic push -#if 
CID_HAS_WARNING("-Wundefined-reinterpret-cast") +#if CMT_HAS_WARNING("-Wundefined-reinterpret-cast") #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" #endif @@ -578,6 +581,12 @@ constexpr inline static const T* derived_cast(const U* ptr) return static_cast<const T*>(ptr); } +template <typename T, typename U> +constexpr inline static T implicit_cast(U&& value) +{ + return std::forward<T>(value); +} + #pragma clang diagnostic pop __attribute__((unused)) static const char* cpu_name(cpu_t set) @@ -590,7 +599,7 @@ __attribute__((unused)) static const char* cpu_name(cpu_t set) #define KFR_FN_S(fn) \ template <typename Arg, typename... Args> \ - KFR_INLINE enable_if_not_vec<Arg> fn(Arg arg, Args... args) \ + CMT_INLINE enable_if_not_vec<Arg> fn(Arg arg, Args... args) \ { \ return fn(make_vector(arg), make_vector(args)...)[0]; \ } @@ -649,7 +658,7 @@ constexpr size_t widthof() template <typename T> constexpr inline const T& bitness_const(const T& x32, const T& x64) { -#ifdef KFR_ARCH_X64 +#ifdef CMT_ARCH_X64 (void)x32; return x64; #else @@ -660,7 +669,7 @@ constexpr inline const T& bitness_const(const T& x32, const T& x64) constexpr inline const char* bitness_const(const char* x32, const char* x64) { -#ifdef KFR_ARCH_X64 +#ifdef CMT_ARCH_X64 (void)x32; return x64; #else @@ -680,18 +689,18 @@ constexpr size_t common_int_vector_size = 16; template <cpu_t c> constexpr size_t native_float_vector_size = -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size; #endif -#ifdef CID_ARCH_ARM +#ifdef CMT_ARCH_ARM c == cpu_t::neon ? 16 : common_float_vector_size; #endif template <cpu_t c> constexpr size_t native_int_vector_size = -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; #endif -#ifdef CID_ARCH_ARM +#ifdef CMT_ARCH_ARM c == cpu_t::neon ? 16 : common_int_vector_size; #endif @@ -701,8 +710,8 @@ struct input_expression using size_type = infinite; constexpr size_type size() const noexcept { return {}; } - KFR_INLINE void begin_block(size_t) const {} - KFR_INLINE void end_block(size_t) const {} + CMT_INLINE void begin_block(size_t) const {} + CMT_INLINE void end_block(size_t) const {} }; struct output_expression @@ -711,8 +720,8 @@ struct output_expression using size_type = infinite; constexpr size_type size() const noexcept { return {}; } - KFR_INLINE void output_begin_block(size_t) const {} - KFR_INLINE void output_end_block(size_t) const {} + CMT_INLINE void output_begin_block(size_t) const {} + CMT_INLINE void output_end_block(size_t) const {} }; template <typename E> @@ -731,8 +740,9 @@ template <typename... Ts> using is_numeric_args = and_t<is_numeric<Ts>...>; template <typename T, cpu_t c = cpu_t::native> -constexpr size_t vector_width = typeclass<T> == datatype::f ? native_float_vector_size<c> / sizeof(T) - : native_int_vector_size<c> / sizeof(T); +constexpr size_t vector_width = const_max(size_t(1), typeclass<T> == datatype::f + ? 
native_float_vector_size<c> / sizeof(T)
+                                           : native_int_vector_size<c> / sizeof(T));
 template <cpu_t c>
 constexpr size_t vector_width<void, c> = 0;
@@ -741,11 +751,11 @@ namespace internal
 {
 template <cpu_t c>
-constexpr size_t native_vector_alignment = std::max(native_float_vector_size<c>, native_int_vector_size<c>);
+constexpr size_t native_vector_alignment = const_max(native_float_vector_size<c>, native_int_vector_size<c>);
 template <cpu_t c>
 constexpr bool fast_unaligned =
-#ifdef CID_ARCH_X86
+#ifdef CMT_ARCH_X86
     c >= cpu_t::avx1;
 #else
     false;
@@ -772,7 +782,7 @@ template <typename T, cpu_t c>
 constexpr size_t vector_capacity = native_register_count* vector_width<T, c>;
 template <typename T, cpu_t c>
-constexpr size_t maximum_vector_size = std::min(static_cast<size_t>(32), vector_capacity<T, c> / 4);
+constexpr size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity<T, c> / 4);
 }
 }
 namespace cometa
 {
 template <typename T, size_t N>
 struct compound_type_traits<kfr::vec_t<T, N>>
 {
-    constexpr static size_t width = N;
-    using subtype = T;
-    using deep_subtype = cometa::deep_subtype<T>;
-    constexpr static bool is_scalar = false;
+    constexpr static size_t width = N;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    using subtype = T;
+    using deep_subtype = cometa::deep_subtype<T>;
+    constexpr static bool is_scalar = false;
+    constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
     template <typename U>
     using rebind = kfr::vec_t<U, N>;
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -40,20 +40,20 @@ template <typename T, typename Class>
 struct univector_base : input_expression, output_expression
 {
     template <typename U, size_t N>
-    KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
+    CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
     {
         T* data = derived_cast<Class>(this)->data();
-        write(ptr_cast<T>(data) + index, cast<T>(value));
+        write(ptr_cast<T>(data) + index, vec<T, N>(value));
     }
     template <typename U, size_t N>
-    KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+    CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
     {
         const T* data = derived_cast<Class>(this)->data();
-        return cast<U>(read<N>(ptr_cast<T>(data) + index));
+        return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index));
     }
     template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
-    KFR_INLINE Class& operator=(Input&& input)
+    CMT_INLINE Class& operator=(Input&& input)
     {
         assign_expr(std::forward<Input>(input));
         return *derived_cast<Class>(this);
@@ -126,16 +126,16 @@ struct univector_base : input_expression, output_expression
 protected:
     template <typename Input>
-    KFR_INLINE void assign_expr(Input&& input)
+    CMT_INLINE void assign_expr(Input&& input)
     {
         process<T>(*this, std::forward<Input>(input), get_size());
     }
 private:
     constexpr infinite size() const noexcept = delete;
-    KFR_INLINE size_t get_size() const { return derived_cast<Class>(this)->size(); }
-    KFR_INLINE const T* get_data() const { return derived_cast<Class>(this)->data(); }
-    KFR_INLINE T* get_data() { return derived_cast<Class>(this)->data(); }
+    CMT_INLINE size_t get_size() const { return derived_cast<Class>(this)->size(); }
+    CMT_INLINE const T* get_data() const { return derived_cast<Class>(this)->data(); }
+    CMT_INLINE T* get_data() { return derived_cast<Class>(this)->data(); }
 };
 template <typename T, size_t Size>
@@ -197,7 +197,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T
     constexpr static bool is_array_ref = true;
     constexpr static bool is_vector = false;
     constexpr static bool is_aligned = false;
-    using value_type = T;
+    using value_type = remove_const<T>;
     using univector_base<T, univector>::operator=;
 };
@@ -249,39 +249,39 @@ template <typename T, size_t Size1 = tag_dynamic_vector, size_t Size2 = tag_dyna
 using univector3d = univector<univector<univector<T, Size3>, Size2>, Size1>;
 template <cpu_t c = cpu_t::native, size_t Tag, typename T, typename Fn>
-KFR_INLINE void process(univector<T, Tag>& vector, Fn&& fn)
+CMT_INLINE void process(univector<T, Tag>& vector, Fn&& fn)
 {
     static_assert(is_input_expression<Fn>::value, "Fn must be an expression");
     return process<T, c>(vector, std::forward<Fn>(fn), vector.size());
 }
 template <cpu_t c = cpu_t::native, typename T, size_t Nsize, typename Fn>
-KFR_INLINE void process(T (&dest)[Nsize], Fn&& fn)
+CMT_INLINE void process(T (&dest)[Nsize], Fn&& fn)
 {
     static_assert(is_input_expression<Fn>::value, "Fn must be an expression");
     return process<T, c>(univector<T, tag_array_ref>(dest), std::forward<Fn>(fn), Nsize);
 }
 template <cpu_t c = cpu_t::native, typename T, typename Fn>
-KFR_INLINE void process(const array_ref<T>& vector, Fn&& fn)
+CMT_INLINE void process(const array_ref<T>& vector, Fn&& fn)
 {
     static_assert(is_input_expression<Fn>::value, "Fn must be an expression");
     return process<T, c>(univector<T, tag_array_ref>(vector), std::forward<Fn>(fn), vector.size());
 }
 template <typename T>
-KFR_INLINE univector_ref<T> make_univector(T* data, size_t size)
+CMT_INLINE univector_ref<T> make_univector(T* data, size_t size)
 {
     return univector_ref<T>(data, size);
 }
 template <typename T>
-KFR_INLINE univector_ref<const T> make_univector(const T* data, size_t size)
+CMT_INLINE univector_ref<const T> make_univector(const T* data, size_t size)
 {
     return univector_ref<const T>(data, size);
 }
 template <typename Expr, typename T = value_type_of<Expr>>
-KFR_INLINE univector<T> render(Expr&& expr)
+CMT_INLINE univector<T> render(Expr&& expr)
 {
     univector<T> result;
     result.resize(expr.size());
@@ -290,7 +290,7 @@ KFR_INLINE univector<T> render(Expr&& expr)
 }
 template <typename Expr, typename T = value_type_of<Expr>>
-KFR_INLINE univector<T> render(Expr&& expr, size_t size)
+CMT_INLINE univector<T> render(Expr&& expr, size_t size)
 {
     univector<T> result;
     result.resize(size);
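Assigning an input expression to a univector still routes through process(), and render() materializes an expression into a freshly allocated univector. A short usage sketch (illustrative only, not part of the change; it assumes kfr/all.hpp and the counter() expression helper provided by the library are available):

```cpp
#include <kfr/all.hpp>
using namespace kfr;

int main()
{
    // assigning an input expression evaluates it over the whole container
    univector<fbase, 8> ramp;
    ramp = counter(); // expected 0, 1, 2, ..., 7

    // render() evaluates an (otherwise unsized) expression into a new univector
    const auto scaled = render(counter() * 2, 8); // expected 0, 2, 4, ..., 14

    return (ramp[3] == 3 && scaled[3] == 6) ? 0 : 1;
}
```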
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -86,20 +86,20 @@ using vec_algn = internal::struct_with_alignment<simd<T, N>, A>;
 template <typename T, size_t N, bool A>
 struct vec_ptr
 {
-    constexpr KFR_INLINE vec_ptr(T* data) noexcept : data(data) {}
-    constexpr KFR_INLINE vec_ptr(const T* data) noexcept : data(const_cast<T*>(data)) {}
-    KFR_INLINE const vec_algn<T, N, A>& operator[](size_t i) const
+    constexpr CMT_INLINE vec_ptr(T* data) noexcept : data(data) {}
+    constexpr CMT_INLINE vec_ptr(const T* data) noexcept : data(const_cast<T*>(data)) {}
+    CMT_INLINE const vec_algn<T, N, A>& operator[](size_t i) const
     {
         return *static_cast<vec_algn<T, N, A>*>(data + i);
     }
-    KFR_INLINE vec_algn<T, N, A>& operator[](size_t i) { return *static_cast<vec_algn<T, N, A>*>(data + i); }
+    CMT_INLINE vec_algn<T, N, A>& operator[](size_t i) { return *static_cast<vec_algn<T, N, A>*>(data + i); }
     T* data;
 };
 template <typename To, typename From, size_t N,
           KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value),
          size_t Nout = N*
compound_type_traits<From>::width / compound_type_traits<To>::width> -constexpr KFR_INLINE vec<To, Nout> subcast(const vec<From, N>& value) noexcept +constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept { return *value; } @@ -127,7 +127,7 @@ get_vec_index(int = 0) constexpr size_t index_undefined = static_cast<size_t>(-1); template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(!is_compound<T>::value)> -KFR_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x, +CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x, const vec<T, N>& y) { vec<T, sizeof...(Indices)> result = __builtin_shufflevector( @@ -151,22 +151,22 @@ constexpr auto inflate(csize_t<groupsize>, csizes_t<indices...>) } template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(is_compound<T>::value)> -KFR_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...> indices, const vec<T, N>& x, +CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...> indices, const vec<T, N>& x, const vec<T, N>& y) { - return subcast<T>( - shufflevector(inflate(csize<widthof<T>()>, indices), subcast<subtype<T>>(x), subcast<subtype<T>>(y))); + return compcast<T>(shufflevector(inflate(csize<widthof<T>()>, indices), compcast<subtype<T>>(x), + compcast<subtype<T>>(y))); } template <size_t... Indices, size_t Nout = sizeof...(Indices), typename T, size_t N> -KFR_INLINE vec<T, Nout> shufflevector(csizes_t<Indices...>, const vec<T, N>& x) +CMT_INLINE vec<T, Nout> shufflevector(csizes_t<Indices...>, const vec<T, N>& x) { return internal::shufflevector<T, N>(csizes<Indices...>, x, x); } template <typename Fn, size_t groupsize, typename T, size_t N, size_t... Indices, size_t Nout = sizeof...(Indices)> -KFR_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y, cvals_t<size_t, Indices...>) +CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y, cvals_t<size_t, Indices...>) { static_assert(N % groupsize == 0, "N % groupsize == 0"); return internal::shufflevector<T, N>( @@ -175,13 +175,13 @@ KFR_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y, cv } template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y) { return internal::shufflevector<Fn, groupsize>(x, y, csizeseq<Nout>); } template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x) { return internal::shufflevector<Fn, groupsize>(x, x, csizeseq<Nout>); } @@ -225,110 +225,165 @@ constexpr swiz<14> s14{}; constexpr swiz<15> s15{}; } -template <typename To, typename From, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr KFR_INLINE To cast(From value) noexcept +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + +template <size_t N, typename T> +constexpr CMT_INLINE vec<T, N> broadcast(T x) { - return static_cast<To>(value); + return (simd<T, N>)(x); } -template <typename To, typename From, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr KFR_INLINE To bitcast(From value) noexcept + +#pragma clang diagnostic pop + +namespace internal { - union { - From from; - To to; - } u{ value }; - return u.to; + +template <typename To, typename From, size_t N, 
typename Tsub = deep_subtype<To>, + size_t Nout = N* compound_type_traits<To>::deep_width> +constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept +{ + return __builtin_convertvector(*value, simd<Tsub, Nout>); } -template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr KFR_INLINE To ubitcast(From value) noexcept +// scalar to scalar +template <typename To, typename From> +struct conversion { - return bitcast<To>(value); + static_assert(std::is_convertible<From, To>::value, ""); + static To cast(const From& value) { return value; } +}; + +// vector to vector +template <typename To, typename From, size_t N> +struct conversion<vec<To, N>, vec<From, N>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<To, N> cast(const vec<From, N>& value) { return builtin_convertvector<To>(value); } +}; + +// vector<vector> to vector<vector> +template <typename To, typename From, size_t N1, size_t N2> +struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, N1>, N2>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) + { + return builtin_convertvector<vec<To, N1>>(value); + } +}; + +// scalar to vector +template <typename To, typename From, size_t N> +struct conversion<vec<To, N>, From> +{ + static_assert(std::is_convertible<From, To>::value, ""); + static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } +}; + +// mask to mask +template <typename To, typename From, size_t N> +struct conversion<mask<To, N>, mask<From, N>> +{ + static_assert(sizeof(To) == sizeof(From), ""); + static mask<To, N> cast(const mask<From, N>& value) { return reinterpret_cast<simd<To, N>>(*value); } +}; } -template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr KFR_INLINE To ibitcast(From value) noexcept +template <typename T> +constexpr size_t size_of() noexcept { - return bitcast<To>(value); + return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width; } -template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr KFR_INLINE To fbitcast(From value) noexcept +template <typename From, size_t N, typename Tsub = deep_subtype<From>, + size_t Nout = N* size_of<From>() / size_of<Tsub>()> +constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& value) noexcept { - return bitcast<To>(value); + return *value; } -template <typename To, typename From, size_t N, KFR_ENABLE_IF(!is_compound<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept +template <typename To, typename From, typename Tout = deep_rebind<From, To>> +constexpr CMT_INLINE Tout cast(const From& value) noexcept { - return __builtin_convertvector(*value, simd<To, N>); + return static_cast<Tout>(value); } -template <typename To, typename From, simdindex N> -constexpr KFR_INLINE simd<To, N> cast(const simd<From, N>& value) noexcept + +template <typename To, typename From> +constexpr CMT_INLINE To bitcast(const From& value) noexcept { - return __builtin_convertvector(value, simd<To, N>); + static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); + union { + From from; + To to; + } u{ value }; + return u.to; } -template <typename To, typename From, size_t N, size_t Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE vec<To, 
Nout> bitcast(const vec<From, N>& value) noexcept + +template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> +constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(*value); + return reinterpret_cast<typename vec<To, Nout>::simd_t>(*value); } -template <typename To, typename From, simdindex N, simdindex Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE simd<To, Nout> bitcast(const simd<From, N>& value) noexcept + +template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> +constexpr CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(value); + return reinterpret_cast<typename mask<To, Nout>::simd_t>(*value); } -template <typename From, size_t N, typename To = utype<From>, size_t Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept +template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr CMT_INLINE To ubitcast(const From& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(*value); + return bitcast<To>(value); } -template <typename From, size_t N, typename To = itype<From>, size_t Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept +template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr CMT_INLINE To ibitcast(const From& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(*value); + return bitcast<To>(value); } -template <typename From, size_t N, typename To = ftype<From>, size_t Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept +template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr CMT_INLINE To fbitcast(const From& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(*value); + return bitcast<To>(value); } -template <typename From, simdindex N, typename To = utype<From>, - simdindex Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE simd<To, Nout> ubitcast(const simd<From, N>& value) noexcept +template <typename From, size_t N, typename To = utype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(value); + return reinterpret_cast<simd<To, Nout>>(*value); } -template <typename From, simdindex N, typename To = itype<From>, - simdindex Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE simd<To, Nout> ibitcast(const simd<From, N>& value) noexcept +template <typename From, size_t N, typename To = itype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept { - return reinterpret_cast<simd<To, Nout>>(value); + return reinterpret_cast<simd<To, Nout>>(*value); } -template <typename From, simdindex N, typename To = ftype<From>, - simdindex Nout = sizeof(From) * N / sizeof(To)> -constexpr KFR_INLINE simd<To, Nout> fbitcast(const simd<From, N>& value) noexcept +template <typename From, size_t N, typename To = ftype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept { - return 
reinterpret_cast<simd<To, Nout>>(value); + return reinterpret_cast<simd<To, Nout>>(*value); } -constexpr KFR_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); } +constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); } -template <typename T, size_t N, size_t... Sizes, size_t Nout = N + csum(csizes<Sizes...>)> -KFR_INLINE vec<T, Nout> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest); +template <typename T, size_t N, size_t... Sizes> +CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest); namespace internal { template <size_t start = 0, size_t stride = 1> struct shuffle_index { - constexpr KFR_INLINE size_t operator()(size_t index) const { return start + index * stride; } + constexpr CMT_INLINE size_t operator()(size_t index) const { return start + index * stride; } }; template <size_t count, size_t start = 0, size_t stride = 1> @@ -339,30 +394,19 @@ struct shuffle_index_wrap } template <size_t count, typename T, size_t N, size_t Nout = N* count> -KFR_INLINE vec<T, Nout> repeat(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index_wrap<N, 0, 1>>(x); } KFR_FN(repeat) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - -template <size_t N, typename T> -constexpr KFR_INLINE vec<T, N> broadcast(T x) -{ - return (simd<T, N>)(x); -} - -#pragma clang diagnostic pop - template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)> -KFR_INLINE vec<T, Nout> resize(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index_wrap<N, 0, 1>>(x); } template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)> -constexpr KFR_INLINE vec<T, Nout> resize(const vec<T, N>& x) +constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) { return x; } @@ -372,13 +416,13 @@ namespace internal_read_write { template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INLINE vec<T, N> read(const T* src) +CMT_INLINE vec<T, N> read(const T* src) { return ptr_cast<vec_algn<subtype<T>, vec<T, N>::scalar_size(), A>>(src)->value; } template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N))> -KFR_INLINE vec<T, N> read(const T* src) +CMT_INLINE vec<T, N> read(const T* src) { constexpr size_t first = prev_poweroftwo(N); constexpr size_t rest = N - first; @@ -387,13 +431,13 @@ KFR_INLINE vec<T, N> read(const T* src) } template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INLINE void write(T* dest, const vec<T, N>& value) +CMT_INLINE void write(T* dest, const vec<T, N>& value) { ptr_cast<vec_algn<subtype<T>, vec<T, N>::scalar_size(), A>>(dest)->value = *value; } template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N))> -KFR_INLINE void write(T* dest, const vec<T, N>& value) +CMT_INLINE void write(T* dest, const vec<T, N>& value) { constexpr size_t first = prev_poweroftwo(N); constexpr size_t rest = N - first; @@ -422,7 +466,8 @@ private: template <typename T> struct vec_op { - using scalar_type = subtype<T>; + using scalar_type = subtype<T>; + using uscalar_type = utype<scalar_type>; template <simdindex N> constexpr static simd<scalar_type, N> add(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept @@ -467,64 +512,67 @@ struct vec_op template <simdindex N> constexpr static simd<scalar_type, N> band(simd<scalar_type, N> x, 
simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(ubitcast(x) & ubitcast(y)); + return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) & + reinterpret_cast<simd<uscalar_type, N>>(y)); } template <simdindex N> constexpr static simd<scalar_type, N> bor(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(ubitcast(x) | ubitcast(y)); + return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) | + reinterpret_cast<simd<uscalar_type, N>>(y)); } template <simdindex N> constexpr static simd<scalar_type, N> bxor(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(ubitcast(x) ^ ubitcast(y)); + return reinterpret_cast<simd<scalar_type, N>>(reinterpret_cast<simd<uscalar_type, N>>(x) ^ + reinterpret_cast<simd<uscalar_type, N>>(y)); } template <simdindex N> constexpr static simd<scalar_type, N> bnot(simd<scalar_type, N> x) noexcept { - return bitcast<scalar_type>(~ubitcast(x)); + return reinterpret_cast<simd<scalar_type, N>>(~reinterpret_cast<simd<uscalar_type, N>>(x)); } template <simdindex N> constexpr static simd<scalar_type, N> eq(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x == y); + return reinterpret_cast<simd<scalar_type, N>>(x == y); } template <simdindex N> constexpr static simd<scalar_type, N> ne(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x != y); + return reinterpret_cast<simd<scalar_type, N>>(x != y); } template <simdindex N> constexpr static simd<scalar_type, N> lt(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x < y); + return reinterpret_cast<simd<scalar_type, N>>(x < y); } template <simdindex N> constexpr static simd<scalar_type, N> gt(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x > y); + return reinterpret_cast<simd<scalar_type, N>>(x > y); } template <simdindex N> constexpr static simd<scalar_type, N> le(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x <= y); + return reinterpret_cast<simd<scalar_type, N>>(x <= y); } template <simdindex N> constexpr static simd<scalar_type, N> ge(simd<scalar_type, N> x, simd<scalar_type, N> y) noexcept { - return bitcast<scalar_type>(x >= y); + return reinterpret_cast<simd<scalar_type, N>>(x >= y); } }; namespace internal { template <typename T, typename... Args, size_t... indices, size_t N = 1 + sizeof...(Args)> -constexpr KFR_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const T& x, const Args&... rest) +constexpr CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const T& x, const Args&... rest) { constexpr size_t width = compound_type_traits<T>::width; - const std::tuple<const T&, const Args&...> list(x, rest...); - typename vec<T, N>::simd_t result{ compound_type_traits<T>::at(std::get<indices / width>(list), + const T list[] = { x, rest... }; + typename vec<T, N>::simd_t result{ compound_type_traits<T>::at(list[indices / width], indices % width)... }; return result; } @@ -536,26 +584,27 @@ constexpr KFR_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const T& x /// @encode template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>> -constexpr KFR_INLINE vec<SubType, N> make_vector(const Arg& x, const Args&... 
rest) +constexpr CMT_INLINE vec<SubType, N> make_vector(const Arg& x, const Args&... rest) { return internal::make_vector_impl<SubType>(csizeseq<N * widthof<SubType>()>, static_cast<SubType>(x), static_cast<SubType>(rest)...); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> make_vector(const vec<T, N>& x) +constexpr CMT_INLINE vec<T, N> make_vector(const vec<T, N>& x) { return x; } template <typename T, T... Values, size_t N = sizeof...(Values)> -constexpr KFR_INLINE vec<T, N> make_vector(cvals_t<T, Values...>) +constexpr CMT_INLINE vec<T, N> make_vector(cvals_t<T, Values...>) { return make_vector<T>(Values...); } KFR_FN(make_vector) template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), - typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>> -constexpr KFR_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest) + typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>, + KFR_ENABLE_IF(is_numeric<SubType>::value)> +constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest) { return internal::make_vector_impl<SubType>(csizeseq<N * widthof<SubType>()>, static_cast<SubType>(x), static_cast<SubType>(rest)...); @@ -567,6 +616,10 @@ struct vec : vec_t<T, N> { static_assert(N > 0 && N <= 256, "Invalid vector size"); + static_assert(!is_vec<T>::value || is_poweroftwo(size_of<T>()), + "Inner vector size must be a power of two"); + + using UT = utype<T>; using value_type = T; using scalar_type = subtype<T>; constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } @@ -576,94 +629,98 @@ struct vec : vec_t<T, N> constexpr static bool is_pod = true; - constexpr KFR_INLINE vec() noexcept {} - constexpr KFR_INLINE vec(simd_t value) noexcept : v(value) {} - constexpr KFR_INLINE vec(const array_ref<T>& value) noexcept + constexpr CMT_INLINE vec() noexcept {} + constexpr CMT_INLINE vec(simd_t value) noexcept : v(value) {} + constexpr CMT_INLINE vec(const array_ref<T>& value) noexcept + : v(*internal_read_write::read<N, false>(value.data())) + { + } + constexpr CMT_INLINE vec(const array_ref<const T>& value) noexcept : v(*internal_read_write::read<N, false>(value.data())) { } template <typename U, KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width > 1)> - constexpr KFR_INLINE vec(const U& value) noexcept + constexpr CMT_INLINE vec(const U& value) noexcept : v(*resize<scalar_size()>(bitcast<scalar_type>(make_vector(static_cast<T>(value))))) { } template <typename U, KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width == 1)> - constexpr KFR_INLINE vec(const U& value) noexcept : v(static_cast<T>(value)) + constexpr CMT_INLINE vec(const U& value) noexcept : v(static_cast<T>(value)) { } template <typename... Ts> - constexpr KFR_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept + constexpr CMT_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept : v(*make_vector<T>(x, y, rest...)) { static_assert(N <= 2 + sizeof...(Ts), "Too few initializers for vec"); } template <size_t N1, size_t N2, size_t... Ns> - constexpr KFR_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2, + constexpr CMT_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2, const vec<T, Ns>&... 
vectors) noexcept : v(*concat(v1, v2, vectors...)) { static_assert(csum(csizes<N1, N2, Ns...>) == N, "Can't concat vectors: invalid csizes"); } - constexpr KFR_INLINE vec(const vec&) noexcept = default; - constexpr KFR_INLINE vec(vec&&) noexcept = default; - constexpr KFR_INLINE vec& operator=(const vec&) noexcept = default; - constexpr KFR_INLINE vec& operator=(vec&&) noexcept = default; - - friend constexpr KFR_INLINE vec operator+(const vec& x, const vec& y) { return vec_op<T>::add(x.v, y.v); } - friend constexpr KFR_INLINE vec operator-(const vec& x, const vec& y) { return vec_op<T>::sub(x.v, y.v); } - friend constexpr KFR_INLINE vec operator*(const vec& x, const vec& y) { return vec_op<T>::mul(x.v, y.v); } - friend constexpr KFR_INLINE vec operator/(const vec& x, const vec& y) { return vec_op<T>::div(x.v, y.v); } - friend constexpr KFR_INLINE vec operator%(const vec& x, const vec& y) { return vec_op<T>::rem(x.v, y.v); } - friend constexpr KFR_INLINE vec operator-(const vec& x) { return vec_op<T>::neg(x.v); } - - friend constexpr KFR_INLINE vec operator&(const vec& x, const vec& y) + constexpr CMT_INLINE vec(const vec&) noexcept = default; + constexpr CMT_INLINE vec(vec&&) noexcept = default; + constexpr CMT_INLINE vec& operator=(const vec&) noexcept = default; + constexpr CMT_INLINE vec& operator=(vec&&) noexcept = default; + + friend constexpr CMT_INLINE vec operator+(const vec& x, const vec& y) { return vec_op<T>::add(x.v, y.v); } + friend constexpr CMT_INLINE vec operator-(const vec& x, const vec& y) { return vec_op<T>::sub(x.v, y.v); } + friend constexpr CMT_INLINE vec operator*(const vec& x, const vec& y) { return vec_op<T>::mul(x.v, y.v); } + friend constexpr CMT_INLINE vec operator/(const vec& x, const vec& y) { return vec_op<T>::div(x.v, y.v); } + friend constexpr CMT_INLINE vec operator%(const vec& x, const vec& y) { return vec_op<T>::rem(x.v, y.v); } + friend constexpr CMT_INLINE vec operator-(const vec& x) { return vec_op<T>::neg(x.v); } + + friend constexpr CMT_INLINE vec operator&(const vec& x, const vec& y) { return vec_op<T>::band(x.v, y.v); } - friend constexpr KFR_INLINE vec operator|(const vec& x, const vec& y) { return vec_op<T>::bor(x.v, y.v); } - friend constexpr KFR_INLINE vec operator^(const vec& x, const vec& y) + friend constexpr CMT_INLINE vec operator|(const vec& x, const vec& y) { return vec_op<T>::bor(x.v, y.v); } + friend constexpr CMT_INLINE vec operator^(const vec& x, const vec& y) { return vec_op<T>::bxor(x.v, y.v); } - friend constexpr KFR_INLINE vec operator~(const vec& x) { return vec_op<T>::bnot(x.v); } + friend constexpr CMT_INLINE vec operator~(const vec& x) { return vec_op<T>::bnot(x.v); } - friend constexpr KFR_INLINE vec operator<<(const vec& x, const vec& y) + friend constexpr CMT_INLINE vec operator<<(const vec& x, const vec& y) { return vec_op<T>::shl(x.v, y.v); } - friend constexpr KFR_INLINE vec operator>>(const vec& x, const vec& y) + friend constexpr CMT_INLINE vec operator>>(const vec& x, const vec& y) { return vec_op<T>::shr(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator==(const vec& x, const vec& y) + friend constexpr CMT_INLINE mask<T, N> operator==(const vec& x, const vec& y) { return vec_op<T>::eq(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator!=(const vec& x, const vec& y) + friend constexpr CMT_INLINE mask<T, N> operator!=(const vec& x, const vec& y) { return vec_op<T>::ne(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator<(const vec& x, const vec& y) + friend constexpr CMT_INLINE 
mask<T, N> operator<(const vec& x, const vec& y) { return vec_op<T>::lt(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator>(const vec& x, const vec& y) + friend constexpr CMT_INLINE mask<T, N> operator>(const vec& x, const vec& y) { return vec_op<T>::gt(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator<=(const vec& x, const vec& y) + friend constexpr CMT_INLINE mask<T, N> operator<=(const vec& x, const vec& y) { return vec_op<T>::le(x.v, y.v); } - friend constexpr KFR_INLINE mask<T, N> operator>=(const vec& x, const vec& y) + friend constexpr CMT_INLINE mask<T, N> operator>=(const vec& x, const vec& y) { return vec_op<T>::ge(x.v, y.v); } #define KFR_ASGN_OP(aop, op) \ - friend KFR_INLINE vec& operator aop(vec& x, const vec& y) \ + friend CMT_INLINE vec& operator aop(vec& x, const vec& y) \ { \ x = x op y; \ return x; \ @@ -678,22 +735,49 @@ struct vec : vec_t<T, N> KFR_ASGN_OP(^=, ^) KFR_ASGN_OP(<<=, <<) KFR_ASGN_OP(>>=, >>) +#undef KFR_ASGN_OP + + template <typename U, typename C = common_type<U, T>> + friend constexpr CMT_INLINE vec<C, N> operator+(const vec& x, const vec<U, N>& y) + { + return vec_op<C>::add(static_cast<vec<C, N>>(x).v, static_cast<vec<C, N>>(y).v); + } + template <typename U, typename C = common_type<U, T>> + friend constexpr CMT_INLINE vec<C, N> operator-(const vec& x, const vec<U, N>& y) + { + return vec_op<C>::sub(static_cast<vec<C, N>>(x).v, static_cast<vec<C, N>>(y).v); + } + template <typename U, typename C = common_type<U, T>> + friend constexpr CMT_INLINE vec<C, N> operator*(const vec& x, const vec<U, N>& y) + { + return vec_op<C>::mul(static_cast<vec<C, N>>(x).v, static_cast<vec<C, N>>(y).v); + } + template <typename U, typename C = common_type<U, T>> + friend constexpr CMT_INLINE vec<C, N> operator/(const vec& x, const vec<U, N>& y) + { + return vec_op<C>::div(static_cast<vec<C, N>>(x).v, static_cast<vec<C, N>>(y).v); + } + template <typename U, typename C = common_type<U, T>> + friend constexpr CMT_INLINE vec<C, N> operator%(const vec& x, const vec<U, N>& y) + { + return vec_op<C>::rem(static_cast<vec<C, N>>(x).v, static_cast<vec<C, N>>(y).v); + } - constexpr KFR_INLINE simd_t operator*() const { return v; } - constexpr KFR_INLINE simd_t& operator*() { return v; } - KFR_INLINE mask<T, N>& asmask() { return ref_cast<mask<T, N>>(*this); } - KFR_INLINE const mask<T, N>& asmask() const { return ref_cast<mask<T, N>>(*this); } - KFR_INLINE value_type operator[](size_t index) const { return data()[index]; } + constexpr CMT_INLINE simd_t operator*() const { return v; } + constexpr CMT_INLINE simd_t& operator*() { return v; } + CMT_INLINE mask<T, N>& asmask() { return ref_cast<mask<T, N>>(*this); } + CMT_INLINE const mask<T, N>& asmask() const { return ref_cast<mask<T, N>>(*this); } + CMT_INLINE value_type operator[](size_t index) const { return data()[index]; } - KFR_INLINE value_type* data() { return ptr_cast<T>(&v); } - KFR_INLINE const T* data() const { return ptr_cast<T>(&v); } + CMT_INLINE value_type* data() { return ptr_cast<T>(&v); } + CMT_INLINE const T* data() const { return ptr_cast<T>(&v); } using array_t = T (&)[N]; - KFR_INLINE array_t arr() { return ref_cast<array_t>(v); } + CMT_INLINE array_t arr() { return ref_cast<array_t>(v); } - template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value)> + template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)> constexpr operator vec<U, N>() const noexcept { - return cast<U>(*this); + return internal::conversion<vec<U, N>, vec<T, 
N>>::cast(*this); } private: @@ -714,12 +798,12 @@ private: struct getter_setter { constexpr getter_setter(simd_t& v, size_t index) noexcept : v(v), index(index) {} - KFR_INLINE getter_setter& operator=(scalar_type value) noexcept + CMT_INLINE getter_setter& operator=(scalar_type value) noexcept { v[index] = value; return *this; } - KFR_INLINE operator scalar_type() const { return v[index]; } + CMT_INLINE operator scalar_type() const { return v[index]; } private: friend struct vec; simd_t& v; @@ -730,72 +814,69 @@ private: template <typename T, size_t N> struct mask : public vec<T, N> { + using UT = utype<T>; using type = T; constexpr static size_t width = N; using base = vec<T, N>; - constexpr KFR_INLINE mask() noexcept : base() {} + constexpr CMT_INLINE mask() noexcept : base() {} - constexpr KFR_INLINE mask(simd<T, N> value) noexcept : base(value) {} + constexpr CMT_INLINE mask(simd<T, N> value) noexcept : base(value) {} template <size_t N1, size_t... Ns> - constexpr KFR_INLINE mask(const mask<T, N1>& mask1, const mask<T, Ns>&... masks) noexcept + constexpr CMT_INLINE mask(const mask<T, N1>& mask1, const mask<T, Ns>&... masks) noexcept : base(*concat(mask1, masks...)) { } template <typename... Ts, typename = enable_if<sizeof...(Ts) + 2 == N>> - constexpr KFR_INLINE mask(bool x, bool y, Ts... rest) noexcept + constexpr CMT_INLINE mask(bool x, bool y, Ts... rest) noexcept : base{ internal::maskbits<T>(x), internal::maskbits<T>(y), internal::maskbits<T>(rest)... } { } - constexpr KFR_INLINE mask(const mask&) noexcept = default; - constexpr KFR_INLINE mask(mask&&) noexcept = default; - KFR_INLINE mask& operator=(const mask&) noexcept = default; - KFR_INLINE mask& operator=(mask&&) noexcept = default; + constexpr CMT_INLINE mask(const mask&) noexcept = default; + constexpr CMT_INLINE mask(mask&&) noexcept = default; + CMT_INLINE mask& operator=(const mask&) noexcept = default; + CMT_INLINE mask& operator=(mask&&) noexcept = default; template <typename M, KFR_ENABLE_IF(sizeof(T) == sizeof(M))> - constexpr KFR_INLINE mask(const vec<M, N>& value) : base(bitcast<T>(value)) + constexpr CMT_INLINE mask(const vec<M, N>& value) : base(bitcast<T>(value)) { } - // template <typename M, typename = u8[sizeof(T) == sizeof(M)]> - // constexpr KFR_INLINE mask(mask<M, N> value) : base(reinterpret_cast<const vec<T, N>&>(value)) - // { - // } - constexpr KFR_INLINE mask operator~() const { return bitcast<T>(~ubitcast(this->v)); } - constexpr KFR_INLINE mask operator&(const vec<T, N>& x) const + friend constexpr CMT_INLINE mask operator&(const mask& x, const mask& y) { - return bitcast<T>(ubitcast(this->v) & ubitcast(x.v)); + return vec_op<T>::band(x.v, y.v); } - constexpr KFR_INLINE mask operator|(const vec<T, N>& x) const + friend constexpr CMT_INLINE mask operator|(const mask& x, const mask& y) { - return bitcast<T>(ubitcast(this->v) | ubitcast(x.v)); + return vec_op<T>::bor(x.v, y.v); } - constexpr KFR_INLINE mask operator^(const vec<T, N>& x) const + friend constexpr CMT_INLINE mask operator^(const mask& x, const mask& y) { - return bitcast<T>(ubitcast(this->v) ^ ubitcast(x.v)); + return vec_op<T>::bxor(x.v, y.v); } + friend constexpr CMT_INLINE mask operator~(const mask& x) { return vec_op<T>::bnot(x.v); } - constexpr KFR_INLINE mask operator&&(const mask& x) const { return *this & x; } - constexpr KFR_INLINE mask operator||(const mask& x) const { return *this | x; } - constexpr KFR_INLINE mask operator!() const { return ~*this; } + constexpr CMT_INLINE mask operator&&(const mask& x) const { return *this 
& x; } + constexpr CMT_INLINE mask operator||(const mask& x) const { return *this | x; } + constexpr CMT_INLINE mask operator!() const { return ~*this; } - constexpr KFR_INLINE simd<T, N> operator*() const { return this->v; } + constexpr CMT_INLINE simd<T, N> operator*() const { return this->v; } - KFR_INLINE vec<T, N>& asvec() { return ref_cast<mask>(*this); } - KFR_INLINE const vec<T, N>& asvec() const { return ref_cast<mask>(*this); } + CMT_INLINE vec<T, N>& asvec() { return ref_cast<mask>(*this); } + CMT_INLINE const vec<T, N>& asvec() const { return ref_cast<mask>(*this); } template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_INLINE operator mask<U, N>() const + CMT_INLINE operator mask<U, N>() const { return bitcast<U>(*this); } - KFR_INLINE bool operator[](size_t index) const { return ibitcast(this->v[index]) < 0; } + CMT_INLINE bool operator[](size_t index) const { return ibitcast(this->v[index]) < 0; } }; -template <typename T, size_t N> -using cvec = vec<T, N * 2>; +template <typename T, size_t N1, size_t N2 = N1> +using mat = vec<vec<T, N1>, N2>; namespace internal { @@ -803,31 +884,31 @@ namespace internal template <size_t start, size_t count> struct shuffle_index_extend { - constexpr KFR_INLINE size_t operator()(size_t index) const + constexpr CMT_INLINE size_t operator()(size_t index) const { return index >= start && index < start + count ? index - start : index_undefined; } }; template <size_t start, size_t count, typename T, size_t N> -KFR_INLINE vec<T, count> concatexact(const vec<T, N>& x, const vec<T, N>& y) +CMT_INLINE vec<T, count> concatexact(const vec<T, N>& x, const vec<T, N>& y) { return kfr::shufflevector<count, internal::shuffle_index<start>>(x, y); } template <size_t start, size_t count, typename T, size_t N1, size_t N2> -KFR_INLINE enable_if<(N1 == N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE enable_if<(N1 == N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) { return concatexact<start, count>(x, y); } template <size_t start, size_t count, typename T, size_t N1, size_t N2> -KFR_INLINE enable_if<(N1 > N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE enable_if<(N1 > N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) { return concatexact<start, count>(x, shufflevector<N1, internal::shuffle_index_extend<0, N2>>(y)); } template <size_t start, size_t count, typename T, size_t N1, size_t N2> -KFR_INLINE enable_if<(N1 < N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE enable_if<(N1 < N2), vec<T, count>> concattwo(const vec<T, N1>& x, const vec<T, N2>& y) { return concatexact<N2 - N1 + start, count>( shufflevector<N2, internal::shuffle_index_extend<N2 - N1, N1>>(x), y); @@ -845,26 +926,26 @@ constexpr mask<T, Nout> partial_mask() } template <typename T, size_t N> -KFR_INLINE vec<T, N> concat(const vec<T, N>& x) +CMT_INLINE vec<T, N> concat(const vec<T, N>& x) { return x; } template <typename T, size_t N1, size_t N2> -KFR_INLINE vec<T, N1 + N2> concat(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE vec<T, N1 + N2> concat(const vec<T, N1>& x, const vec<T, N2>& y) { return concattwo<0, N1 + N2>(x, y); } template <typename T, size_t N1, size_t N2, size_t... Sizes> -KFR_INLINE auto concat(const vec<T, N1>& x, const vec<T, N2>& y, const vec<T, Sizes>&... args) +CMT_INLINE auto concat(const vec<T, N1>& x, const vec<T, N2>& y, const vec<T, Sizes>&... 
args) { return concat(x, concat(y, args...)); } } -template <typename T, size_t N, size_t... Sizes, size_t Nout> -KFR_INLINE vec<T, Nout> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest) +template <typename T, size_t N, size_t... Sizes> +CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest) { return internal::concat(x, rest...); } @@ -1012,6 +1093,28 @@ using mu64x8 = mask<u64, 8>; using mu64x16 = mask<u64, 16>; using mu64x32 = mask<u64, 32>; +using u8x2x2 = vec<vec<u8, 2>, 2>; +using i8x2x2 = vec<vec<i8, 2>, 2>; +using u16x2x2 = vec<vec<u16, 2>, 2>; +using i16x2x2 = vec<vec<i16, 2>, 2>; +using u32x2x2 = vec<vec<u32, 2>, 2>; +using i32x2x2 = vec<vec<i32, 2>, 2>; +using u64x2x2 = vec<vec<u64, 2>, 2>; +using i64x2x2 = vec<vec<i64, 2>, 2>; +using f32x2x2 = vec<vec<f32, 2>, 2>; +using f64x2x2 = vec<vec<f64, 2>, 2>; + +using u8x4x4 = vec<vec<u8, 4>, 4>; +using i8x4x4 = vec<vec<i8, 4>, 4>; +using u16x4x4 = vec<vec<u16, 4>, 4>; +using i16x4x4 = vec<vec<i16, 4>, 4>; +using u32x4x4 = vec<vec<u32, 4>, 4>; +using i32x4x4 = vec<vec<i32, 4>, 4>; +using u64x4x4 = vec<vec<u64, 4>, 4>; +using i64x4x4 = vec<vec<i64, 4>, 4>; +using f32x4x4 = vec<vec<f32, 4>, 4>; +using f64x4x4 = vec<vec<f64, 4>, 4>; + namespace glsl_names { using vec2 = f32x2; @@ -1117,19 +1220,19 @@ struct maxvec template <size_t Index, typename T, size_t N, typename Fn, typename... Args, typename Tout = result_of<Fn(subtype<decay<Args>>...)>> -constexpr KFR_INLINE Tout applyfn_helper(Fn&& fn, Args&&... args) +constexpr CMT_INLINE Tout applyfn_helper(Fn&& fn, Args&&... args) { return fn(args[Index]...); } template <typename T, size_t N, typename Fn, typename... Args, typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices> -constexpr KFR_INLINE vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) +constexpr CMT_INLINE vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) { return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...); } template <typename T, size_t N, typename Fn, size_t... Indices> -constexpr KFR_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) +constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) { return make_vector(((void)Indices, void(), fn())...); } @@ -1137,30 +1240,30 @@ constexpr KFR_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) template <typename T, size_t N, typename Fn, typename... Args, typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> -constexpr KFR_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args) +constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args) { return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...); } template <size_t N, typename Fn, typename T = result_of<Fn()>> -constexpr KFR_INLINE vec<T, N> apply(Fn&& fn) +constexpr CMT_INLINE vec<T, N> apply(Fn&& fn) { return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>); } template <typename T, int N> -KFR_INLINE vec<T, N> tovec(simd<T, N> x) +CMT_INLINE vec<T, N> tovec(simd<T, N> x) { return x; } -#ifdef CID_ARCH_SSE2 -KFR_INLINE f32x4 tovec(__m128 x) { return f32x4(x); } -KFR_INLINE f64x2 tovec(__m128d x) { return f64x2(x); } +#ifdef CMT_ARCH_SSE2 +CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); } +CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); } #endif template <typename T, typename... 
Args, size_t Nout = (sizeof...(Args) + 1)> -constexpr KFR_INLINE mask<T, Nout> make_mask(bool arg, Args... args) +constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... args) { simd<T, Nout> temp{ internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))... }; return temp; @@ -1168,63 +1271,63 @@ constexpr KFR_INLINE mask<T, Nout> make_mask(bool arg, Args... args) KFR_FN(make_mask) template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> zerovector() +constexpr CMT_INLINE vec<T, N> zerovector() { constexpr size_t width = N * compound_type_traits<T>::width; - return subcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>())); + return compcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>())); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> zerovector(vec_t<T, N>) +constexpr CMT_INLINE vec<T, N> zerovector(vec_t<T, N>) { return zerovector<T, N>(); } KFR_FN(zerovector) template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> allonesvector() +constexpr CMT_INLINE vec<T, N> allonesvector() { return zerovector<T, N>() == zerovector<T, N>(); } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> allonesvector(vec_t<T, N>) +constexpr CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>) { return allonesvector<T, N>(); } KFR_FN(allonesvector) template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> undefinedvector() +constexpr CMT_INLINE vec<T, N> undefinedvector() { return vec<T, N>{}; } template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> undefinedvector(vec_t<T, N>) +constexpr CMT_INLINE vec<T, N> undefinedvector(vec_t<T, N>) { return undefinedvector<T, N>(); } KFR_FN(undefinedvector) template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -KFR_INLINE vec<T, Nout> low(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> low(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index<>>(x); } template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -KFR_INLINE vec_t<T, Nout> low(vec_t<T, N>) +CMT_INLINE vec_t<T, Nout> low(vec_t<T, N>) { return {}; } template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -KFR_INLINE vec<T, Nout> high(const vec<T, N>& x) +CMT_INLINE vec<T, Nout> high(const vec<T, N>& x) { return shufflevector<Nout, internal::shuffle_index<prev_poweroftwo(N - 1)>>(x); } template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -KFR_INLINE vec_t<T, Nout> high(vec_t<T, N>) +CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>) { return {}; } @@ -1237,16 +1340,16 @@ namespace internal template <typename Fn> struct expression_lambda : input_expression { - KFR_INLINE expression_lambda(Fn&& fn) : fn(std::move(fn)) {} + CMT_INLINE expression_lambda(Fn&& fn) : fn(std::move(fn)) {} template <typename T, size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_t<T, N>>::value)> - KFR_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> y) const { return fn(cinput, index, y); } template <typename T, size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, size_t>::value)> - KFR_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const { vec<T, N> result; for (size_t i = 0; i < N; i++) @@ -1256,7 +1359,7 @@ struct expression_lambda : input_expression return result; } template <typename T, size_t N, KFR_ENABLE_IF(N&& is_callable<Fn>::value)> - KFR_INLINE vec<T, N> 
operator()(cinput_t, size_t, vec_t<T, N>) const + CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const { vec<T, N> result; for (size_t i = 0; i < N; i++) @@ -1285,10 +1388,12 @@ namespace cometa template <typename T, size_t N> struct compound_type_traits<kfr::simd<T, N>> { - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static bool is_scalar = false; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; template <typename U> using rebind = kfr::simd<U, N>; template <typename U> @@ -1300,10 +1405,12 @@ struct compound_type_traits<kfr::simd<T, N>> template <typename T, size_t N> struct compound_type_traits<kfr::vec<T, N>> { - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static bool is_scalar = false; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; template <typename U> using rebind = kfr::vec<U, N>; template <typename U> @@ -1315,10 +1422,12 @@ struct compound_type_traits<kfr::vec<T, N>> template <typename T, size_t N> struct compound_type_traits<kfr::mask<T, N>> { - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static bool is_scalar = false; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; template <typename U> using rebind = kfr::mask<U, N>; template <typename U> diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -1,366 +1,395 @@ #pragma once +#ifdef LIBC_WORKAROUND_GETS +extern char* gets(char* __s); +#endif + #if defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__x86_64__) -#define CID_ARCH_X86 1 -#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM) -#define CID_ARCH_ARM 1 +#define CMT_ARCH_X86 1 +#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM) || defined(__aarch64__) +#define CMT_ARCH_ARM 1 #endif -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 #if defined(_M_X64) || defined(__x86_64__) -#define CID_ARCH_X64 1 +#define CMT_ARCH_X64 1 #else -#define CID_ARCH_X32 1 -#endif - -#if defined __AVX512F__ && !defined CID_ARCH_AVX512 -#define CID_ARCH_AVX512 1 -#define CID_ARCH_AVX2 1 -#define CID_ARCH_AVX 1 -#define CID_ARCH_SSE42 1 -#define CID_ARCH_SSE41 1 -#define CID_ARCH_SSSE3 1 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __AVX2__ && !defined CID_ARCH_AVX2 -#define CID_ARCH_AVX2 1 -#define CID_ARCH_AVX 1 -#define CID_ARCH_SSE42 1 -#define CID_ARCH_SSE41 1 -#define CID_ARCH_SSSE3 1 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __AVX__ && !defined CID_ARCH_AVX -#define CID_ARCH_AVX 1 -#define CID_ARCH_SSE42 1 -#define CID_ARCH_SSE41 1 -#define CID_ARCH_SSSE3 1 -#define 
CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __SSE4_2__ && !defined CID_ARCH_SSE4_2 -#define CID_ARCH_SSE4_2 1 -#define CID_ARCH_SSE41 1 -#define CID_ARCH_SSSE3 1 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __SSE4_1__ && !defined CID_ARCH_SSE4_1 -#define CID_ARCH_SSE4_1 1 -#define CID_ARCH_SSSE3 1 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __SSSE3__ && !defined CID_ARCH_SSSE3 -#define CID_ARCH_SSSE3 1 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if defined __SSE3__ && !defined CID_ARCH_SSE3 -#define CID_ARCH_SSE3 1 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif -#if (defined CID_ARCH_X64 || defined __SSE2__) && !defined CID_ARCH_SSE2 -#define CID_ARCH_SSE2 1 -#define CID_ARCH_SSE 1 -#endif - -#if (defined CID_ARCH_X64 || defined __SSE__) && !defined CID_ARCH_SSE1 -#define CID_ARCH_SSE 1 -#endif - -#if defined __FMA__ && !defined CID_ARCH_FMA -#define CID_ARCH_FMA 1 -#endif - -#if defined __AES__ && !defined CID_ARCH_AES -#define CID_ARCH_AES 1 -#endif - -#if defined __BMI__ && !defined CID_ARCH_BMI -#define CID_ARCH_BMI 1 -#endif - -#if defined __BMI2__ && !defined CID_ARCH_BMI2 -#define CID_ARCH_BMI2 1 -#endif - -#if defined __LZCNT__ && !defined CID_ARCH_LZCNT -#define CID_ARCH_LZCNT 1 -#endif - -#if defined CID_ARCH_AVX512 -#define CID_ARCH_NAME avx512 -#elif defined CID_ARCH_AVX2 -#define CID_ARCH_NAME avx2 -#elif defined CID_ARCH_AVX -#define CID_ARCH_NAME avx -#elif defined CID_ARCH_SSE4_1 -#define CID_ARCH_NAME sse41 -#elif defined CID_ARCH_SSSE3 -#define CID_ARCH_NAME ssse3 -#elif defined CID_ARCH_SSE3 -#define CID_ARCH_NAME sse3 -#elif defined CID_ARCH_SSE2 -#define CID_ARCH_NAME sse2 -#elif defined CID_ARCH_SSE -#define CID_ARCH_NAME sse -#endif - -#elif defined(CID_ARCH_ARM) - -#if defined(__arm64__) -#define CID_ARCH_X64 1 +#define CMT_ARCH_X32 1 +#endif + +#if defined __AVX512F__ && !defined CMT_ARCH_AVX512 +#define CMT_ARCH_AVX512 1 +#define CMT_ARCH_AVX2 1 +#define CMT_ARCH_AVX 1 +#define CMT_ARCH_SSE42 1 +#define CMT_ARCH_SSE41 1 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __AVX2__ && !defined CMT_ARCH_AVX2 +#define CMT_ARCH_AVX2 1 +#define CMT_ARCH_AVX 1 +#define CMT_ARCH_SSE42 1 +#define CMT_ARCH_SSE41 1 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __AVX__ && !defined CMT_ARCH_AVX +#define CMT_ARCH_AVX 1 +#define CMT_ARCH_SSE42 1 +#define CMT_ARCH_SSE41 1 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __SSE4_2__ && !defined CMT_ARCH_SSE4_2 +#define CMT_ARCH_SSE4_2 1 +#define CMT_ARCH_SSE41 1 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __SSE4_1__ && !defined CMT_ARCH_SSE4_1 +#define CMT_ARCH_SSE4_1 1 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __SSSE3__ && !defined CMT_ARCH_SSSE3 +#define CMT_ARCH_SSSE3 1 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if defined __SSE3__ && !defined CMT_ARCH_SSE3 +#define CMT_ARCH_SSE3 1 +#define CMT_ARCH_SSE2 1 +#define CMT_ARCH_SSE 1 +#endif +#if (defined CMT_ARCH_X64 || defined __SSE2__) && !defined CMT_ARCH_SSE2 +#define CMT_ARCH_SSE2 1 
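Editor's note (illustrative sketch, not part of this commit): the renamed CMT_ARCH_* macros above form a cascade in which every wider x86 instruction-set level also defines the narrower ones (AVX512 implies AVX2, AVX, SSE4.2, ... SSE2), so client code only needs to test the minimum level it requires. A minimal usage sketch; the function name is hypothetical:

    #include <kfr/cident.h>

    // Built with -mavx2 or wider, CMT_ARCH_AVX2 is defined and CMT_ARCH_SSE2
    // comes along with it; built with plain -msse2, only the SSE2 branch remains.
    inline const char* detected_simd_level()
    {
    #if defined(CMT_ARCH_AVX2)
        return "avx2 or wider";
    #elif defined(CMT_ARCH_SSE2)
        return "sse2";
    #else
        return "generic";
    #endif
    }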
+#define CMT_ARCH_SSE 1 +#endif + +#if (defined CMT_ARCH_X64 || defined __SSE__) && !defined CMT_ARCH_SSE1 +#define CMT_ARCH_SSE 1 +#endif + +#if defined __FMA__ && !defined CMT_ARCH_FMA +#define CMT_ARCH_FMA 1 +#endif + +#if defined __AES__ && !defined CMT_ARCH_AES +#define CMT_ARCH_AES 1 +#endif + +#if defined __BMI__ && !defined CMT_ARCH_BMI +#define CMT_ARCH_BMI 1 +#endif + +#if defined __BMI2__ && !defined CMT_ARCH_BMI2 +#define CMT_ARCH_BMI2 1 +#endif + +#if defined __LZCNT__ && !defined CMT_ARCH_LZCNT +#define CMT_ARCH_LZCNT 1 +#endif + +#if defined CMT_ARCH_AVX512 +#define CMT_ARCH_NAME avx512 +#elif defined CMT_ARCH_AVX2 +#define CMT_ARCH_NAME avx2 +#elif defined CMT_ARCH_AVX +#define CMT_ARCH_NAME avx +#elif defined CMT_ARCH_SSE4_1 +#define CMT_ARCH_NAME sse41 +#elif defined CMT_ARCH_SSSE3 +#define CMT_ARCH_NAME ssse3 +#elif defined CMT_ARCH_SSE3 +#define CMT_ARCH_NAME sse3 +#elif defined CMT_ARCH_SSE2 +#define CMT_ARCH_NAME sse2 +#elif defined CMT_ARCH_SSE +#define CMT_ARCH_NAME sse +#endif + +#elif defined(CMT_ARCH_ARM) + +#if defined(__aarch64__) +#define CMT_ARCH_X64 1 #else -#define CID_ARCH_X32 1 +#define CMT_ARCH_X32 1 #endif #ifdef __ARM_NEON__ #if __ARM_ARCH >= 8 && defined(__aarch64__) -#define CID_ARCH_NEON64 1 -#define CID_ARCH_NAME neon64 +#define CMT_ARCH_NEON64 1 +#define CMT_ARCH_NEON 1 +#define CMT_ARCH_NAME neon64 #else -#define CID_ARCH_NEON 1 -#define CID_ARCH_NAME neon +#define CMT_ARCH_NEON 1 +#define CMT_ARCH_NAME neon +#define KFR_NO_NATIVE_F64 1 +#endif +#endif + #endif + +#ifndef CMT_ARCH_NAME +#define CMT_ARCH_NAME common #endif +#ifndef KFR_NO_NATIVE_F64 +#define KFR_NATIVE_F64 1 #endif -#ifndef CID_ARCH_NAME -#define CID_ARCH_NAME common +#ifndef KFR_NO_NATIVE_I64 +#define KFR_NATIVE_I64 1 #endif -#define CID_STRINGIFY2(x) #x -#define CID_STRINGIFY(x) CID_STRINGIFY2(x) +#define CMT_STRINGIFY2(x) #x +#define CMT_STRINGIFY(x) CMT_STRINGIFY2(x) #if defined(_WIN32) // Windows -#define CID_OS_WIN 1 +#define CMT_OS_WIN 1 #endif #if defined(__APPLE__) #include "TargetConditionals.h" #ifdef TARGET_OS_IPHONE -#define CID_OS_IOS 1 -#define CID_OS_MOBILE 1 +#define CMT_OS_IOS 1 +#define CMT_OS_MOBILE 1 #elif TARGET_IPHONE_SIMULATOR -#define CID_OS_IOS 1 -#define CID_OS_IOS_SIMULATOR 1 -#define CID_OS_MOBILE 1 +#define CMT_OS_IOS 1 +#define CMT_OS_IOS_SIMULATOR 1 +#define CMT_OS_MOBILE 1 #elif TARGET_OS_MAC -#define CID_OS_MAC 1 -#define CID_OS_MACOS 1 -#define CID_OS_OSX 1 +#define CMT_OS_MAC 1 +#define CMT_OS_MACOS 1 +#define CMT_OS_OSX 1 #endif -#define CID_OS_POSIX 1 +#define CMT_OS_POSIX 1 #endif #if defined(__ANDROID__) -#define CID_OS_ANDROID 1 -#define CID_OS_MOBILE 1 -#define CID_OS_POSIX 1 +#define CMT_OS_ANDROID 1 +#define CMT_OS_MOBILE 1 +#define CMT_OS_POSIX 1 #endif #if defined(__linux__) -#define CID_OS_LINUX 1 -#define CID_OS_POSIX 1 +#define CMT_OS_LINUX 1 +#define CMT_OS_POSIX 1 #endif #if defined(_MSC_VER) // Visual C/C++ -#define CID_COMPILER_MSVC 1 -#define CID_MSVC_ATTRIBUTES 1 -#define CID_MSC_VER _MSC_VER +#define CMT_COMPILER_MSVC 1 +#define CMT_MSVC_ATTRIBUTES 1 +#define CMT_MSC_VER _MSC_VER #else -#define CID_MSC_VER 0 +#define CMT_MSC_VER 0 #endif #if defined(__GNUC__) || defined(__clang__) // GCC, Clang -#define CID_COMPILER_GNU 1 -#define CID_GNU_ATTRIBUTES 1 -#define CID_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#define CMT_COMPILER_GNU 1 +#define CMT_GNU_ATTRIBUTES 1 +#define CMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) #if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__ -#define CID_HAS_GXX_CXX11 1 +#define 
CMT_HAS_GXX_CXX11 1 #endif #else -#define CID_GCC_VERSION 0 +#define CMT_GCC_VERSION 0 #endif #if defined(__INTEL_COMPILER) // Intel Compiler -#define CID_COMPILER_INTEL 1 -#define CID_ICC_VERSION __INTEL_COMPILER +#define CMT_COMPILER_INTEL 1 +#define CMT_ICC_VERSION __INTEL_COMPILER #elif defined(__ICL) -#define CID_COMPILER_INTEL 1 -#define CID_ICC_VERSION __ICL +#define CMT_COMPILER_INTEL 1 +#define CMT_ICC_VERSION __ICL #else -#define CID_ICC_VERSION 0 +#define CMT_ICC_VERSION 0 #endif #if defined(__clang__) // Clang -#define CID_COMPILER_CLANG 1 -#ifndef CID_GNU_ATTRIBUTES -#define CID_GNU_ATTRIBUTES 1 +#define CMT_COMPILER_CLANG 1 +#ifndef CMT_GNU_ATTRIBUTES +#define CMT_GNU_ATTRIBUTES 1 #endif #endif -#if defined(CID_GNU_ATTRIBUTES) +#if defined(CMT_GNU_ATTRIBUTES) -#define CID_NODEBUG +#define CMT_NODEBUG // __attribute__((__nodebug__)) -#define CID_INLINE __inline__ __attribute__((__always_inline__)) -#define CID_INTRIN CID_INLINE CID_NODEBUG -#define CID_INLINE_MEMBER __attribute__((__always_inline__)) -#define CID_INLINE_LAMBDA CID_INLINE_MEMBER -#define CID_NOINLINE __attribute__((__noinline__)) -#define CID_FLATTEN __attribute__((__flatten__)) -#define CID_RESTRICT __restrict__ +#define CMT_INLINE __inline__ __attribute__((__always_inline__)) +#define CMT_INTRIN CMT_INLINE CMT_NODEBUG +#define CMT_INLINE_MEMBER __attribute__((__always_inline__)) +#define CMT_INLINE_LAMBDA CMT_INLINE_MEMBER +#define CMT_NOINLINE __attribute__((__noinline__)) +#define CMT_FLATTEN __attribute__((__flatten__)) +#define CMT_RESTRICT __restrict__ -#elif defined(CID_MSVC_ATTRIBUTES) +#elif defined(CMT_MSVC_ATTRIBUTES) -#define CID_NODEBUG -#define CID_INLINE inline __forceinline -#define CID_INTRIN CID_INLINE CID_NODEBUG -#define CID_INLINE_MEMBER __forceinline -#define CID_INLINE_LAMBDA -#define CID_NOINLINE __declspec(noinline) -#define CID_FLATTEN -#define CID_RESTRICT __restrict +#define CMT_NODEBUG +#define CMT_INLINE inline __forceinline +#define CMT_INTRIN CMT_INLINE CMT_NODEBUG +#define CMT_INLINE_MEMBER __forceinline +#define CMT_INLINE_LAMBDA +#define CMT_NOINLINE __declspec(noinline) +#define CMT_FLATTEN +#define CMT_RESTRICT __restrict #endif -#define CID_INLINE_STATIC CID_INLINE static +#define CMT_INLINE_STATIC CMT_INLINE static -#define CID_EXTERN_C extern "C" +#define CMT_EXTERN_C extern "C" -#define CID_PUBLIC_C CID_EXTERN_C CID_NOINLINE +#define CMT_PUBLIC_C CMT_EXTERN_C CMT_NOINLINE -#define CID_ALWAYS_INLINE_STATIC CID_ALWAYS_INLINE static +#define CMT_ALWAYS_INLINE_STATIC CMT_ALWAYS_INLINE static -#ifdef CID_ARCH_x86 -#ifdef CID_OS_WIN -#define CID_CDECL __cdecl +#ifdef CMT_ARCH_x86 +#ifdef CMT_OS_WIN +#define CMT_CDECL __cdecl #else -#define CID_CDECL __attribute__((cdecl)) +#define CMT_CDECL __attribute__((cdecl)) #endif #else -#define CID_CDECL +#define CMT_CDECL #endif -#ifdef CID_OS_WIN -#if defined(CID_MSVC_ATTRIBUTES) -#define CID_DLL_EXPORT __declspec(dllexport) -#define CID_DLL_IMPORT __declspec(dllimport) +#ifdef CMT_OS_WIN +#if defined(CMT_MSVC_ATTRIBUTES) +#define CMT_DLL_EXPORT __declspec(dllexport) +#define CMT_DLL_IMPORT __declspec(dllimport) #else -#define CID_DLL_EXPORT __attribute__((dllexport)) -#define CID_DLL_IMPORT __attribute__((dllimport)) +#define CMT_DLL_EXPORT __attribute__((dllexport)) +#define CMT_DLL_IMPORT __attribute__((dllimport)) #endif #else -#define CID_DLL_EXPORT -#define CID_DLL_IMPORT +#define CMT_DLL_EXPORT +#define CMT_DLL_IMPORT #endif #ifdef __has_builtin -#define CID_HAS_BUILTIN(builtin) __has_builtin(builtin) +#define 
CMT_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +#define CMT_HAS_BUILTIN(builtin) 0 +#endif + +#if CMT_HAS_BUILTIN(CMT_ASSUME) +#define CMT_ASSUME(x) __builtin_assume(x) +#else +#define CMT_ASSUME(x) \ + do \ + { \ + } while (0) +#endif + +#if CMT_HAS_BUILTIN(CMT_ASSUME) +#define CMT_ASSUME_ALIGNED(x, a) __builtin_assume_aligned(x, a) #else -#define CID_HAS_BUILTIN(builtin) 0 +#define CMT_ASSUME_ALIGNED(x, a) x #endif #ifdef __has_feature -#define CID_HAS_FEATURE(feature) __has_feature(feature) +#define CMT_HAS_FEATURE(feature) __has_feature(feature) #else -#define CID_HAS_FEATURE(feature) 0 +#define CMT_HAS_FEATURE(feature) 0 #endif #ifdef __has_extension -#define CID_HAS_EXTENSION(extension) __has_extension(extension) +#define CMT_HAS_EXTENSION(extension) __has_extension(extension) #else -#define CID_HAS_EXTENSION(extension) 0 +#define CMT_HAS_EXTENSION(extension) 0 #endif #ifdef __has_attribute -#define CID_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#define CMT_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) #else -#define CID_HAS_ATTRIBUTE(attribute) 0 +#define CMT_HAS_ATTRIBUTE(attribute) 0 #endif #ifdef __has_warning -#define CID_HAS_WARNING(warning) __has_warning(warning) +#define CMT_HAS_WARNING(warning) __has_warning(warning) #else -#define CID_HAS_WARNING(warning) 0 +#define CMT_HAS_WARNING(warning) 0 #endif -#define CID_HAS_VARIADIC_TEMPLATES \ - (CID_HAS_FEATURE(cxx_variadic_templates) || (CID_GCC_VERSION >= 404 && CID_HAS_GXX_CXX11) || \ - CID_MSC_VER >= 1800) +#define CMT_HAS_VARIADIC_TEMPLATES \ + (CMT_HAS_FEATURE(cxx_variadic_templates) || (CMT_GCC_VERSION >= 404 && CMT_HAS_GXX_CXX11) || \ + CMT_MSC_VER >= 1800) -#ifdef CID_BUILDING_DLL -#define CID_C_API CID_DLL_EXPORT +#ifdef CMT_BUILDING_DLL +#define CMT_C_API CMT_DLL_EXPORT #else -#define CID_C_API CID_DLL_IMPORT +#define CMT_C_API CMT_DLL_IMPORT #endif -#if __cplusplus >= 201103L || CID_MSC_VER >= 1900 || CID_HAS_FEATURE(cxx_constexpr) -#define CID_HAS_CONSTEXPR 1 +#if __cplusplus >= 201103L || CMT_MSC_VER >= 1900 || CMT_HAS_FEATURE(cxx_constexpr) +#define CMT_HAS_CONSTEXPR 1 #endif -#if __cpp_constexpr >= 201304 || CID_HAS_FEATURE(cxx_constexpr) -#define CID_HAS_FULL_CONSTEXPR 1 +#if __cpp_constexpr >= 201304 || CMT_HAS_FEATURE(cxx_constexpr) +#define CMT_HAS_FULL_CONSTEXPR 1 #endif -#if CID_HAS_CONSTEXPR -#define CID_CONSTEXPR constexpr +#if CMT_HAS_CONSTEXPR +#define CMT_CONSTEXPR constexpr #else -#define CID_CONSTEXPR +#define CMT_CONSTEXPR #endif -#if CID_HAS_FEATURE(cxx_noexcept) || (CID_GCC_VERSION >= 408 && CID_HAS_GXX_CXX11) || CID_MSC_VER >= 1900 -#define CID_HAS_NOEXCEPT 1 +#if CMT_HAS_FEATURE(cxx_noexcept) || (CMT_GCC_VERSION >= 408 && CMT_HAS_GXX_CXX11) || CMT_MSC_VER >= 1900 +#define CMT_HAS_NOEXCEPT 1 #endif -#if CID_HAS_NOEXCEPT -#define CID_NOEXCEPT noexcept +#if CMT_HAS_NOEXCEPT +#define CMT_NOEXCEPT noexcept #else -#define CID_NOEXCEPT +#define CMT_NOEXCEPT #endif -#if CID_COMPILER_GNU && !defined(__EXCEPTIONS) -#define CID_HAS_EXCEPTIONS 0 +#if CMT_COMPILER_GNU && !defined(__EXCEPTIONS) +#define CMT_HAS_EXCEPTIONS 0 #endif -#if CID_COMPILER_MSVC && !_HAS_EXCEPTIONS -#define CID_HAS_EXCEPTIONS 0 +#if CMT_COMPILER_MSVC && !_HAS_EXCEPTIONS +#define CMT_HAS_EXCEPTIONS 0 #endif -#ifndef CID_HAS_EXCEPTIONS -#define CID_HAS_EXCEPTIONS 1 +#ifndef CMT_HAS_EXCEPTIONS +#define CMT_HAS_EXCEPTIONS 1 #endif #if __has_include(<assert.h>) #include <assert.h> -#define CID_HAS_ASSERT_H 1 +#define CMT_HAS_ASSERT_H 1 #endif -#ifndef CID_THROW -#if CID_HAS_EXCEPTIONS -#define CID_THROW(x) throw x 
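Editor's note (illustrative sketch, not part of this commit): the CMT_HAS_* probes above wrap __has_builtin / __has_feature / __has_attribute / __has_warning so they expand to 0 on compilers that lack them, and CMT_CONSTEXPR, CMT_NOEXCEPT and CMT_ASSUME_ALIGNED degrade to no-ops on the same basis. A hypothetical sketch of the intended pattern:

    #include <kfr/cident.h>

    // constexpr and noexcept are applied only where the compiler supports them;
    // CMT_ASSUME_ALIGNED is a plain pass-through when __builtin_assume_aligned
    // is unavailable, so the cast below is valid either way.
    CMT_CONSTEXPR int half(int x) CMT_NOEXCEPT { return x / 2; }

    inline float* aligned_view(float* p)
    {
        return static_cast<float*>(CMT_ASSUME_ALIGNED(p, 32));
    }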
+#ifndef CMT_THROW +#if CMT_HAS_EXCEPTIONS +#define CMT_THROW(x) throw x #else -#ifdef CID_HAS_ASSERT_H -#define CID_THROW(x) assert(false) +#ifdef CMT_HAS_ASSERT_H +#define CMT_THROW(x) assert(false) #else -#define CID_THROW(x) abort() +#define CMT_THROW(x) abort() #endif #endif #endif -#if __cplusplus >= 201103L || CID_MSC_VER >= 1900 || CID_HAS_FEATURE(cxx_constexpr) +#if __cplusplus >= 201103L || CMT_MSC_VER >= 1900 || CMT_HAS_FEATURE(cxx_constexpr) #include <cstdint> namespace cid @@ -372,21 +401,42 @@ constexpr inline static size_t arraysize(const T (&)[N]) noexcept } } -#define CID_ARRAYSIZE(arr) ::cid::arraysize(arr) -#elif CID_COMPILER_MSVC -#define CID_ARRAYSIZE(arr) _countof(arr) +#define CMT_ARRAYSIZE(arr) ::cid::arraysize(arr) +#elif CMT_COMPILER_MSVC +#define CMT_ARRAYSIZE(arr) _countof(arr) #elif __cplusplus >= 199711L && \ (defined(__INTEL_COMPILER) || defined(__clang__) || \ (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)))) template <typename T, size_t N> char (&COUNTOF_REQUIRES_ARRAY_ARGUMENT(T (&)[N]))[N]; -#define CID_ARRAYSIZE(x) sizeof(COUNTOF_REQUIRES_ARRAY_ARGUMENT(x)) +#define CMT_ARRAYSIZE(x) sizeof(COUNTOF_REQUIRES_ARRAY_ARGUMENT(x)) +#else +#define CMT_ARRAYSIZE(arr) sizeof(arr) / sizeof(arr[0]) +#endif + +#ifdef CMT_COMPILER_MSVC +#define CMT_FUNC_SIGNATURE __FUNCSIG__ #else -#define CID_ARRAYSIZE(arr) sizeof(arr) / sizeof(arr[0]) +#define CMT_FUNC_SIGNATURE __PRETTY_FUNCTION__ +#endif + +#if CMT_COMPILER_CLANG +#define CMT_LOOP_NOUNROLL \ + _Pragma("clang loop vectorize( disable )") _Pragma("clang loop interleave( disable )") \ + _Pragma("clang loop unroll( disable )") + +#define CMT_LOOP_UNROLL _Pragma("clang loop unroll( full )") +#define CMT_VEC_CC __attribute__((vectorcall)) +#else +#define CMT_LOOP_NOUNROLL +#define CMT_LOOP_UNROLL +#ifdef CMT_COMPILER_MSVC +#define CMT_VEC_CC __vectorcall +#endif #endif -#ifdef CID_COMPILER_MSVC -#define CID_FUNC_SIGNATURE __FUNCSIG__ +#if defined(CMT_GNU_ATTRIBUTES) +#define CMT_FAST_CC __attribute__((fastcall)) #else -#define CID_FUNC_SIGNATURE __PRETTY_FUNCTION__ +#define CMT_FAST_CC __fastcall #endif diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -22,6 +22,18 @@ using pvoid = void*; template <typename...> using void_t = void; +// Workaround for GCC 4.8 +template <typename T> +constexpr const T& const_max(const T& x, const T& y) +{ + return x > y ? x : y; +} +template <typename T> +constexpr const T& const_min(const T& x, const T& y) +{ + return x < y ? 
x : y; +} + namespace details { constexpr inline bool args_or() { return false; } @@ -135,10 +147,12 @@ constexpr size_t typeindex() template <typename T> struct compound_type_traits { - constexpr static size_t width = 1; - using subtype = T; - using deep_subtype = T; - constexpr static bool is_scalar = true; + constexpr static size_t width = 1; + constexpr static size_t deep_width = width; + using subtype = T; + using deep_subtype = T; + constexpr static size_t depth = 0; + constexpr static bool is_scalar = true; template <typename U> using rebind = U; @@ -166,10 +180,12 @@ using deep_rebind = typename compound_type_traits<T>::template deep_rebind<SubTy template <typename T> struct compound_type_traits<std::pair<T, T>> { - constexpr static size_t width = 2; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; + constexpr static size_t width = 2; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; template <typename U> using rebind = std::pair<U, U>; @@ -744,7 +760,7 @@ inline auto call_if_callable(Fn&& fn) template <typename Fn, typename... Args> inline auto bind_func(Fn&& fn, Args&&... args) { - return [=]() CID_INLINE_LAMBDA { return fn(details::call_if_callable(std::forward<Args>(args))...); }; + return [=]() CMT_INLINE_LAMBDA { return fn(details::call_if_callable(std::forward<Args>(args))...); }; } template <typename T> @@ -880,7 +896,7 @@ using identity = typename details::identity_impl<T>::type; struct swallow { template <typename... T> - CID_INTRIN constexpr swallow(T&&...) noexcept + CMT_INTRIN constexpr swallow(T&&...) 
noexcept { } }; @@ -905,24 +921,24 @@ struct carray<T, 1> static constexpr size_t size() noexcept { return 1; } template <size_t index> - CID_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_INTRIN constexpr T& get(csize_t<index>) noexcept { static_assert(index == 0, "carray: Array index is out of range"); return val; } template <size_t index> - CID_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept { static_assert(index == 0, "carray: Array index is out of range"); return val; } template <size_t index> - CID_INTRIN constexpr T& get() noexcept + CMT_INTRIN constexpr T& get() noexcept { return get(csize<index>); } template <size_t index> - CID_INTRIN constexpr const T& get() const noexcept + CMT_INTRIN constexpr const T& get() const noexcept { return get(csize<index>); } @@ -960,39 +976,39 @@ struct carray : carray<T, N - 1> constexpr carray(const carray&) noexcept = default; constexpr carray(carray&&) noexcept = default; static constexpr size_t size() noexcept { return N; } - CID_INTRIN constexpr T& get(csize_t<N - 1>) noexcept { return val; } + CMT_INTRIN constexpr T& get(csize_t<N - 1>) noexcept { return val; } template <size_t index> - CID_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_INTRIN constexpr T& get(csize_t<index>) noexcept { return carray<T, N - 1>::get(csize<index>); } template <size_t index> - CID_INTRIN constexpr T& get() noexcept + CMT_INTRIN constexpr T& get() noexcept { return get(csize<index>); } - CID_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; } + CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; } template <size_t index> - CID_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept { return carray<T, N - 1>::get(csize<index>); } template <size_t index> - CID_INTRIN constexpr const T& get() const noexcept + CMT_INTRIN constexpr const T& get() const noexcept { return get(csize<index>); } - CID_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); } - CID_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); } - CID_INTRIN constexpr const T* back() const noexcept { return val; } - CID_INTRIN constexpr T* back() noexcept { return val; } - CID_INTRIN constexpr const T* begin() const noexcept { return carray<T, N - 1>::begin(); } - CID_INTRIN constexpr const T* end() const noexcept { return &val + 1; } - CID_INTRIN constexpr T* begin() noexcept { return carray<T, N - 1>::begin(); } - CID_INTRIN constexpr T* end() noexcept { return &val + 1; } - CID_INTRIN constexpr const T* data() const noexcept { return begin(); } - CID_INTRIN constexpr T* data() noexcept { return begin(); } - CID_INTRIN constexpr bool empty() const noexcept { return false; } + CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); } + CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); } + CMT_INTRIN constexpr const T* back() const noexcept { return val; } + CMT_INTRIN constexpr T* back() noexcept { return val; } + CMT_INTRIN constexpr const T* begin() const noexcept { return carray<T, N - 1>::begin(); } + CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; } + CMT_INTRIN constexpr T* begin() noexcept { return carray<T, N - 1>::begin(); } + CMT_INTRIN constexpr T* end() noexcept { return &val + 1; } + CMT_INTRIN constexpr const T* data() const noexcept { 
return begin(); } + CMT_INTRIN constexpr T* data() noexcept { return begin(); } + CMT_INTRIN constexpr bool empty() const noexcept { return false; } private: T val; }; @@ -1001,7 +1017,7 @@ private: struct fn_##fn \ { \ template <typename... Args> \ - CID_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ + CMT_INLINE_MEMBER decltype(fn(std::declval<Args>()...)) operator()(Args&&... args) const \ { \ return fn(std::forward<Args>(args)...); \ } \ @@ -1014,7 +1030,7 @@ private: struct fn_##fn \ { \ template <typename... Args> \ - CID_INLINE_MEMBER decltype(fn<CMT_ESC tpl_args>(std::declval<Args>()...)) operator()( \ + CMT_INLINE_MEMBER decltype(fn<CMT_ESC tpl_args>(std::declval<Args>()...)) operator()( \ Args&&... args) const \ { \ return fn<CMT_ESC tpl_args>(std::forward<Args>(args)...); \ @@ -1156,19 +1172,19 @@ template <typename T> using value_type_of = typename decay<T>::value_type; template <typename T, typename Fn> -CID_INTRIN void cforeach(cvals_t<T>, Fn&&) +CMT_INTRIN void cforeach(cvals_t<T>, Fn&&) { } template <typename T, T v0, T... values, typename Fn> -CID_INTRIN void cforeach(cvals_t<T, v0, values...>, Fn&& fn) +CMT_INTRIN void cforeach(cvals_t<T, v0, values...>, Fn&& fn) { fn(cval<T, v0>); cforeach(cvals_t<T, values...>(), std::forward<Fn>(fn)); } template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)> -CID_INTRIN void cforeach(T&& list, Fn&& fn) +CMT_INTRIN void cforeach(T&& list, Fn&& fn) { for (const auto& v : list) { @@ -1177,7 +1193,7 @@ CID_INTRIN void cforeach(T&& list, Fn&& fn) } template <typename T, size_t N, typename Fn> -CID_INTRIN void cforeach(const T (&array)[N], Fn&& fn) +CMT_INTRIN void cforeach(const T (&array)[N], Fn&& fn) { for (size_t i = 0; i < N; i++) { @@ -1188,38 +1204,38 @@ CID_INTRIN void cforeach(const T (&array)[N], Fn&& fn) namespace details { template <typename... Ts, typename Fn, size_t... indices> -CID_INTRIN void cforeach_tuple_impl(const std::tuple<Ts...>& tuple, Fn&& fn, csizes_t<indices...>) +CMT_INTRIN void cforeach_tuple_impl(const std::tuple<Ts...>& tuple, Fn&& fn, csizes_t<indices...>) { swallow{ (fn(std::get<indices>(tuple)), void(), 0)... }; } template <typename T0, typename... types, typename Fn, size_t... indices> -CID_INTRIN void cforeach_types_impl(ctypes_t<T0, types...>, Fn&& fn, csizes_t<indices...>) +CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...>, Fn&& fn, csizes_t<indices...>) { swallow{ (fn(ctype<type_of<details::get_nth_type<indices, T0, types...>>>), void(), 0)... }; } } template <typename... Ts, typename Fn> -CID_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn) +CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn) { details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq<sizeof...(Ts)>); } template <typename... 
Ts, typename Fn> -CID_INTRIN void cforeach(const std::tuple<Ts...>& tuple, Fn&& fn) +CMT_INTRIN void cforeach(const std::tuple<Ts...>& tuple, Fn&& fn) { details::cforeach_tuple_impl(tuple, std::forward<Fn>(fn), csizeseq<sizeof...(Ts)>); } template <typename A0, typename A1, typename Fn> -CID_INTRIN void cforeach(A0&& a0, A1&& a1, Fn&& fn) +CMT_INTRIN void cforeach(A0&& a0, A1&& a1, Fn&& fn) { cforeach(std::forward<A0>(a0), [&](auto v0) { cforeach(std::forward<A1>(a1), [&](auto v1) { fn(v0, v1); }); }); } template <typename A0, typename A1, typename A2, typename Fn> -CID_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) +CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) { cforeach(std::forward<A0>(a0), [&](auto v0) { cforeach(std::forward<A1>(a1), @@ -1228,13 +1244,13 @@ CID_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) } template <typename T, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CID_INTRIN decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), CmpFn&& = CmpFn()) +CMT_INTRIN decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), CmpFn&& = CmpFn()) { return deffn(); } template <typename T, T v0, T... values, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CID_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, +CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn()) { if (cmpfn(value, v0)) @@ -1249,19 +1265,19 @@ CID_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, } template <typename TrueFn, typename FalseFn = fn_noop> -CID_INTRIN decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn()) +CMT_INTRIN decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn()) { return truefn(cbool<true>); } template <typename TrueFn, typename FalseFn = fn_noop> -CID_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) +CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) { return falsefn(cbool<false>); } template <typename T, T start, T stop, typename BodyFn> -CID_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) +CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) { return cforeach(cvalrange<T, start, stop>, std::forward<BodyFn>(bodyfn)); } @@ -1316,20 +1332,20 @@ struct virtual_function { virtual Result operator()(Args... args) = 0; virtual virtual_function* make_copy() const = 0; - CID_INTRIN virtual ~virtual_function() = default; + CMT_INTRIN virtual ~virtual_function() = default; }; template <typename Fn, typename Result, typename... Args> struct virtual_function_impl : virtual_function<Result, Args...> { public: - CID_INTRIN virtual_function_impl(const Fn& fn) : fn(fn) {} - CID_INTRIN Result operator()(Args... args) override final { return fn(args...); } - CID_INTRIN virtual_function<Result, Args...>* make_copy() const override final + CMT_INTRIN virtual_function_impl(const Fn& fn) : fn(fn) {} + CMT_INTRIN Result operator()(Args... 
args) override final { return fn(args...); } + CMT_INTRIN virtual_function<Result, Args...>* make_copy() const override final { return new virtual_function_impl{ fn }; } - CID_INTRIN ~virtual_function_impl() {} + CMT_INTRIN ~virtual_function_impl() {} private: Fn fn; @@ -1347,13 +1363,13 @@ struct func_filter<Result(Args...)> }; template <typename T> -constexpr CID_INTRIN T return_val() noexcept +constexpr CMT_INTRIN T return_val() noexcept { return {}; } template <> -constexpr CID_INTRIN void return_val<void>() noexcept +constexpr CMT_INTRIN void return_val<void>() noexcept { } } @@ -1381,16 +1397,16 @@ struct function<Result(Args...)> return *this; } - CID_INTRIN function() : fn(nullptr) {} - CID_INTRIN function(std::nullptr_t) : fn(nullptr) {} + CMT_INTRIN function() : fn(nullptr) {} + CMT_INTRIN function(std::nullptr_t) : fn(nullptr) {} template <typename Func> - CID_INTRIN function(const Func& x) + CMT_INTRIN function(const Func& x) : fn(new details::virtual_function_impl<typename details::func_filter<Func>::type, Result, Args...>( x)) { } function(const this_t& other) : fn(other.fn ? other.fn->make_copy() : nullptr) {} - CID_INTRIN function& operator=(const this_t& other) + CMT_INTRIN function& operator=(const this_t& other) { if ((&other != this) && (other.fn)) { @@ -1400,14 +1416,14 @@ struct function<Result(Args...)> } return *this; } - CID_INTRIN function& operator=(std::nullptr_t) + CMT_INTRIN function& operator=(std::nullptr_t) { delete fn; fn = nullptr; return *this; } template <typename Fn> - CID_INTRIN function& operator=(const Fn& x) + CMT_INTRIN function& operator=(const Fn& x) { using FnImpl = details::virtual_function_impl<typename details::func_filter<Fn>::type, Result, Args...>; @@ -1416,24 +1432,24 @@ struct function<Result(Args...)> fn = temp; return *this; } - CID_INTRIN Result operator()(Args... args) const + CMT_INTRIN Result operator()(Args... args) const { if (fn) return (*fn)(args...); else return details::return_val<Result>(); } - CID_INTRIN explicit operator bool() const noexcept { return !!fn; } + CMT_INTRIN explicit operator bool() const noexcept { return !!fn; } - CID_INTRIN ~function() { delete fn; } + CMT_INTRIN ~function() { delete fn; } private: details::virtual_function<Result, Args...>* fn; }; template <typename Ret, typename... Args, typename T, typename Fn, typename DefFn = fn_noop> -CID_INLINE function<Ret(Args...)> cdispatch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn()) +CMT_INLINE function<Ret(Args...)> cdispatch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn()) { - return [=](Args... args) CID_INLINE_MEMBER -> Ret { return deffn(std::forward<Args>(args)...); }; + return [=](Args... args) CMT_INLINE_MEMBER -> Ret { return deffn(std::forward<Args>(args)...); }; } template <typename Ret, typename... Args, typename T, T v0, T... values, typename Fn, @@ -1444,7 +1460,7 @@ inline function<Ret(Args...)> cdispatch(cvals_t<T, v0, values...>, identity<T> v if (value == v0) { return [=](Args... args) - CID_INLINE_MEMBER -> Ret { return fn(cval<T, v0>, std::forward<Args>(args)...); }; + CMT_INLINE_MEMBER -> Ret { return fn(cval<T, v0>, std::forward<Args>(args)...); }; } else { @@ -1462,7 +1478,7 @@ inline size_t cfind(cvals_t<T, values...>, identity<T> value) } template <typename Fn, typename... Args> -CID_NOINLINE static result_of<Fn(Args...)> noinline(Fn&& fn, Args&&... args) +CMT_NOINLINE static result_of<Fn(Args...)> noinline(Fn&& fn, Args&&... 
args) { return fn(std::forward<Args>(args)...); } @@ -1471,7 +1487,7 @@ template <typename Fn> struct fn_noinline { template <typename... Args> - CID_INTRIN result_of<Fn(Args...)> operator()(Args&&... args) const + CMT_INTRIN result_of<Fn(Args...)> operator()(Args&&... args) const { return noinline(Fn{}, std::forward<Args>(args)...); } @@ -1479,7 +1495,7 @@ struct fn_noinline template <typename... Args, typename Fn, typename Ret = decltype(std::declval<Fn>()(std::declval<Args>()...)), typename NonMemFn = Ret (*)(Fn*, Args...)> -CID_INTRIN NonMemFn make_nonmember(const Fn&) +CMT_INTRIN NonMemFn make_nonmember(const Fn&) { return [](Fn* fn, Args... args) -> Ret { return fn->operator()(std::forward<Args>(args)...); }; } @@ -1515,9 +1531,9 @@ inline const char* type_name() noexcept { constexpr size_t prefix = details::strlen("const char *cometa::type_name() [T = "); constexpr size_t postfix = details::strlen("]"); - constexpr size_t length = sizeof(CID_FUNC_SIGNATURE) - 1 - prefix - postfix; + constexpr size_t length = sizeof(CMT_FUNC_SIGNATURE) - 1 - prefix - postfix; static const std::array<char, length + 1> name = - details::gettypename_impl(CID_FUNC_SIGNATURE + prefix, csizeseq<length>); + details::gettypename_impl(CMT_FUNC_SIGNATURE + prefix, csizeseq<length>); return name.data(); } @@ -1728,14 +1744,14 @@ struct autocast_impl { const Tfrom value; template <typename T> - CID_INTRIN constexpr operator T() const noexcept + CMT_INTRIN constexpr operator T() const noexcept { return static_cast<T>(value); } }; template <typename Tfrom> -CID_INTRIN constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) noexcept +CMT_INTRIN constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) noexcept { return { value }; } diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp @@ -7,7 +7,7 @@ #include <utility> #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wformat-security") +#if CMT_HAS_WARNING("-Wformat-security") #pragma clang diagnostic ignored "-Wformat-security" #pragma clang diagnostic ignored "-Wused-but-marked-unused" #endif @@ -16,7 +16,7 @@ namespace cometa { template <typename... Args> -CID_INLINE std::string as_string(const Args&... args); +CMT_INLINE std::string as_string(const Args&... args); template <typename T> constexpr inline const T& repr(const T& value) @@ -46,13 +46,13 @@ namespace details { template <size_t N, size_t... indices> -CID_INLINE constexpr cstring<N> make_cstring_impl(const char (&str)[N], csizes_t<indices...>) +CMT_INLINE constexpr cstring<N> make_cstring_impl(const char (&str)[N], csizes_t<indices...>) { return { { str[indices]..., 0 } }; } template <size_t N1, size_t N2, size_t... indices> -CID_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1, +CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1, const cstring<N2>& str2, csizes_t<indices...>) { @@ -60,7 +60,7 @@ CID_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring< return { { (indices < L1 ? str1[indices] : str2[indices - L1])..., 0 } }; } template <size_t N1, size_t N2, typename... 
Args> -CID_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1, +CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1, const cstring<N2>& str2) { return concat_str_impl(str1, str2, csizeseq<N1 - 1 + N2 - 1>); @@ -77,29 +77,29 @@ cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, c } } -CID_INLINE constexpr cstring<1> concat_cstring() { return { { 0 } }; } +CMT_INLINE constexpr cstring<1> concat_cstring() { return { { 0 } }; } template <size_t N1> -CID_INLINE constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) +CMT_INLINE constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) { return str1; } template <size_t N1, size_t N2, typename... Args> -CID_INLINE constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, +CMT_INLINE constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, const Args&... args) { return details::concat_str_impl(str1, concat_cstring(str2, args...)); } template <size_t N> -CID_INLINE constexpr cstring<N> make_cstring(const char (&str)[N]) +CMT_INLINE constexpr cstring<N> make_cstring(const char (&str)[N]) { return details::make_cstring_impl(str, csizeseq<N - 1>); } template <char... chars> -CID_INLINE constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) +CMT_INLINE constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) { return { { chars..., 0 } }; } @@ -152,99 +152,99 @@ constexpr auto itoa() } template <typename T, char t, int width, int prec, CMT_ENABLE_IF(width < 0 && prec >= 0)> -CID_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) +CMT_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) { return concat_cstring(make_cstring("."), itoa<prec>()); } template <typename T, char t, int width, int prec, CMT_ENABLE_IF(width >= 0 && prec < 0)> -CID_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) +CMT_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) { return itoa<width>(); } template <typename T, char t, int width, int prec, CMT_ENABLE_IF(width < 0 && prec < 0)> -CID_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) +CMT_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) { return make_cstring(""); } template <typename T, char t, int width, int prec, CMT_ENABLE_IF(width >= 0 && prec >= 0)> -CID_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) +CMT_INLINE constexpr auto value_fmt_arg(ctype_t<fmt_t<T, t, width, prec>>) { return concat_cstring(itoa<width>(), make_cstring("."), itoa<prec>()); } -CID_INLINE constexpr auto value_fmt(ctype_t<bool>) { return make_cstring("s"); } -CID_INLINE constexpr auto value_fmt(ctype_t<std::string>) { return make_cstring("s"); } -CID_INLINE constexpr auto value_fmt(ctype_t<char>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<signed char>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<unsigned char>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<short>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<unsigned short>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<int>) { return make_cstring("d"); } -CID_INLINE constexpr auto value_fmt(ctype_t<long>) { return make_cstring("ld"); } -CID_INLINE constexpr auto value_fmt(ctype_t<long long>) { return make_cstring("lld"); } 
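Editor's note (illustrative sketch, not part of this commit): the value_fmt overloads above map each C++ type to its printf conversion ("d", "ld", "g", "s", ...); build_fmt, further down in this diff, substitutes those conversions for "{}" placeholders so the cometa formatting helpers stay consistent with the argument types. A small usage sketch, assuming the printfmt and format functions shown later in this diff:

    #include <kfr/cometa/string.hpp>

    int main()
    {
        // Each "{}" is replaced by the specifier value_fmt selects for the
        // argument type: int -> "%d", double -> "%g", unsigned int -> "%u".
        cometa::printfmt("samples: {}, peak: {}\n", 1024, 0.97);
        const std::string text = cometa::format("rate = {}", 44100u);
        cometa::printfmt("{}\n", text);
        return 0;
    }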
-CID_INLINE constexpr auto value_fmt(ctype_t<unsigned int>) { return make_cstring("u"); } -CID_INLINE constexpr auto value_fmt(ctype_t<unsigned long>) { return make_cstring("lu"); } -CID_INLINE constexpr auto value_fmt(ctype_t<unsigned long long>) { return make_cstring("llu"); } -CID_INLINE constexpr auto value_fmt(ctype_t<float>) { return make_cstring("g"); } -CID_INLINE constexpr auto value_fmt(ctype_t<double>) { return make_cstring("g"); } -CID_INLINE constexpr auto value_fmt(ctype_t<long double>) { return make_cstring("Lg"); } -CID_INLINE constexpr auto value_fmt(ctype_t<const char*>) { return make_cstring("s"); } -CID_INLINE constexpr auto value_fmt(ctype_t<char*>) { return make_cstring("s"); } -CID_INLINE constexpr auto value_fmt(ctype_t<void*>) { return make_cstring("p"); } -CID_INLINE constexpr auto value_fmt(ctype_t<const void*>) { return make_cstring("p"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<bool>) { return make_cstring("s"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<std::string>) { return make_cstring("s"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<char>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<signed char>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<unsigned char>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<short>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<unsigned short>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<int>) { return make_cstring("d"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<long>) { return make_cstring("ld"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<long long>) { return make_cstring("lld"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<unsigned int>) { return make_cstring("u"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<unsigned long>) { return make_cstring("lu"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<unsigned long long>) { return make_cstring("llu"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<float>) { return make_cstring("g"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<double>) { return make_cstring("g"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<long double>) { return make_cstring("Lg"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<const char*>) { return make_cstring("s"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<char*>) { return make_cstring("s"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<void*>) { return make_cstring("p"); } +CMT_INLINE constexpr auto value_fmt(ctype_t<const void*>) { return make_cstring("p"); } template <char... 
chars> -CID_INLINE constexpr auto value_fmt(ctype_t<cchars_t<chars...>>) +CMT_INLINE constexpr auto value_fmt(ctype_t<cchars_t<chars...>>) { return concat_cstring(make_cstring("s"), make_cstring(cchars<chars...>)); } template <typename T> -CID_INLINE constexpr auto value_fmt(ctype_t<ctype_t<T>>) +CMT_INLINE constexpr auto value_fmt(ctype_t<ctype_t<T>>) { return make_cstring("s"); } template <typename T, int width, int prec> -CID_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, static_cast<char>(-1), width, prec>> fmt) +CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, static_cast<char>(-1), width, prec>> fmt) { return concat_cstring(value_fmt_arg(fmt), value_fmt(ctype<repr_type<T>>)); } template <typename T, char t, int width, int prec> -CID_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, t, width, prec>> fmt) +CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, t, width, prec>> fmt) { return concat_cstring(value_fmt_arg(fmt), cstring<2>{ { t, 0 } }); } template <char... chars> -CID_INLINE const char* pack_value(const cchars_t<chars...>&) +CMT_INLINE const char* pack_value(const cchars_t<chars...>&) { return ""; } template <typename Arg> -CID_INLINE const Arg& pack_value(const Arg& value) +CMT_INLINE const Arg& pack_value(const Arg& value) { return value; } -CID_INLINE double pack_value(float value) { return static_cast<double>(value); } -CID_INLINE auto pack_value(bool value) { return value ? "true" : "false"; } -CID_INLINE auto pack_value(const std::string& value) { return value.c_str(); } +CMT_INLINE double pack_value(float value) { return static_cast<double>(value); } +CMT_INLINE auto pack_value(bool value) { return value ? "true" : "false"; } +CMT_INLINE auto pack_value(const std::string& value) { return value.c_str(); } template <typename T> -CID_INLINE const char* pack_value(ctype_t<T>) +CMT_INLINE const char* pack_value(ctype_t<T>) { return type_name<T>(); } template <typename T, char t, int width, int prec> -CID_INLINE auto pack_value(const fmt_t<T, t, width, prec>& value) +CMT_INLINE auto pack_value(const fmt_t<T, t, width, prec>& value) { return pack_value(repr(value.value)); } template <size_t N1, size_t Nnew, size_t... indices> -CID_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& str, +CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& str, const cstring<Nnew>& newfmt, csizes_t<indices...>) { @@ -279,7 +279,7 @@ CID_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& } template <size_t N1, size_t Nto> -CID_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt) +CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt) { return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>); } @@ -295,10 +295,10 @@ inline std::string replace_one(const std::string& str, const std::string& from, return r; } -CID_INLINE const std::string& build_fmt(const std::string& str, ctypes_t<>) { return str; } +CMT_INLINE const std::string& build_fmt(const std::string& str, ctypes_t<>) { return str; } template <typename Arg, typename... 
Args> -CID_INLINE auto build_fmt(const std::string& str, ctypes_t<Arg, Args...>) +CMT_INLINE auto build_fmt(const std::string& str, ctypes_t<Arg, Args...>) { constexpr auto fmt = value_fmt(ctype<decay<Arg>>); return build_fmt(replace_one(str, "{}", "%" + std::string(fmt.data())), ctypes<Args...>); @@ -306,13 +306,13 @@ CID_INLINE auto build_fmt(const std::string& str, ctypes_t<Arg, Args...>) } template <char t, int width = -1, int prec = -1, typename T> -CID_INLINE details::fmt_t<T, t, width, prec> fmt(const T& value) +CMT_INLINE details::fmt_t<T, t, width, prec> fmt(const T& value) { return { value }; } template <int width = -1, int prec = -1, typename T> -CID_INLINE details::fmt_t<T, static_cast<char>(-1), width, prec> fmtwidth(const T& value) +CMT_INLINE details::fmt_t<T, static_cast<char>(-1), width, prec> fmtwidth(const T& value) { return { value }; } @@ -358,7 +358,7 @@ template <char... chars> struct print_t { template <typename... Args> - CID_INLINE void operator()(const Args&... args) + CMT_INLINE void operator()(const Args&... args) { constexpr auto format_str = build_fmt_str(cchars<chars...>, ctypes<repr_type<Args>...>); @@ -373,7 +373,7 @@ constexpr format_t<chars...> operator""_format() } template <typename Char, Char... chars> -constexpr CID_INLINE print_t<chars...> operator""_print() +constexpr CMT_INLINE print_t<chars...> operator""_print() { return {}; } @@ -381,28 +381,28 @@ constexpr CID_INLINE print_t<chars...> operator""_print() #pragma clang diagnostic pop template <typename... Args> -CID_INLINE void printfmt(const std::string& fmt, const Args&... args) +CMT_INLINE void printfmt(const std::string& fmt, const Args&... args) { const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); std::printf(format_str.data(), details::pack_value(repr(args))...); } template <typename... Args> -CID_INLINE void fprintfmt(FILE* f, const std::string& fmt, const Args&... args) +CMT_INLINE void fprintfmt(FILE* f, const std::string& fmt, const Args&... args) { const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); std::fprintf(f, format_str.data(), details::pack_value(repr(args))...); } template <typename... Args> -CID_INLINE int snprintfmt(char* str, size_t size, const std::string& fmt, const Args&... args) +CMT_INLINE int snprintfmt(char* str, size_t size, const std::string& fmt, const Args&... args) { const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); return std::snprintf(str, size, format_str.data(), details::pack_value(repr(args))...); } template <typename... Args> -CID_INLINE std::string format(const std::string& fmt, const Args&... args) +CMT_INLINE std::string format(const std::string& fmt, const Args&... args) { std::string result; const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); @@ -416,7 +416,7 @@ CID_INLINE std::string format(const std::string& fmt, const Args&... args) } template <typename... Args> -CID_INLINE void print(const Args&... args) +CMT_INLINE void print(const Args&... args) { constexpr auto format_str = concat_cstring( concat_cstring(make_cstring("%"), details::value_fmt(ctype<decay<repr_type<Args>>>))...); @@ -424,7 +424,7 @@ CID_INLINE void print(const Args&... args) } template <typename... Args> -CID_INLINE void println(const Args&... args) +CMT_INLINE void println(const Args&... 
args) { constexpr auto format_str = concat_cstring( concat_cstring(make_cstring("%"), details::value_fmt(ctype<decay<repr_type<Args>>>))..., @@ -433,7 +433,7 @@ CID_INLINE void println(const Args&... args) } template <typename... Args> -CID_INLINE std::string as_string(const Args&... args) +CMT_INLINE std::string as_string(const Args&... args) { std::string result; constexpr auto format_str = concat_cstring( diff --git a/include/kfr/dft.hpp b/include/kfr/dft.hpp @@ -0,0 +1,31 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. + */ +#pragma once + +#include "base.hpp" + +#include "dft/bitrev.hpp" +#include "dft/conv.hpp" +#include "dft/fft.hpp" +#include "dft/ft.hpp" +#include "dft/reference_dft.hpp" diff --git a/include/kfr/dft/bitrev.hpp b/include/kfr/dft/bitrev.hpp @@ -85,7 +85,7 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i) template <size_t log2n, size_t bitrev, typename T> KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) { - __builtin_assume(i != j); + CMT_ASSUME(i != j); using cxx = cvec<T, 16>; constexpr size_t N = 1 << log2n; constexpr size_t N4 = 2 * N / 4; @@ -102,7 +102,7 @@ KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) template <size_t log2n, size_t bitrev, typename T> KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j) { - __builtin_assume(i != j); + CMT_ASSUME(i != j); using cxx = cvec<T, 16>; constexpr size_t N = 1 << log2n; constexpr size_t N4 = 2 * N / 4; @@ -259,7 +259,7 @@ void cwrite_reordered(T* out, cvec<T, 16> value, size_t N4, cbool_t<use_br2>) template <typename T, bool use_br2> KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) { - __builtin_assume(i != j); + CMT_ASSUME(i != j); const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4); cwrite_reordered(inout + j, vi, N4, cbool<use_br2>); diff --git a/include/kfr/dft/conv.hpp b/include/kfr/dft/conv.hpp @@ -31,7 +31,7 @@ #include "fft.hpp" #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wshadow") +#if CMT_HAS_WARNING("-Wshadow") #pragma clang diagnostic ignored "-Wshadow" #endif diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -35,7 +35,7 @@ #include "ft.hpp" #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wshadow") +#if CMT_HAS_WARNING("-Wshadow") #pragma clang diagnostic ignored "-Wshadow" #endif @@ -65,7 +65,7 @@ protected: }; #pragma clang diagnostic push -#if CID_HAS_WARNING("-Wassume") +#if CMT_HAS_WARNING("-Wassume") #pragma clang 
diagnostic ignored "-Wassume" #endif @@ -194,7 +194,7 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout } template <typename T> -KFR_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) +CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) { if (n == 0) { @@ -214,9 +214,9 @@ KFR_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) } else { - double kth = c_pi<double, 2> * (n / static_cast<double>(size)); - double tcos = +kfr::cos(kth); - double tsin = -kfr::sin(kth); + fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); + fbase tcos = +kfr::cos(kth); + fbase tsin = -kfr::sin(kth); return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); } } @@ -226,7 +226,7 @@ KFR_SINTRIN void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_ bool split_format) { vec<T, 2 * width> result = T(); - KFR_LOOP_UNROLL + CMT_LOOP_UNROLL for (size_t i = 0; i < width; i++) { const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); @@ -241,10 +241,10 @@ KFR_SINTRIN void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_ } template <typename T, size_t width> -KFR_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) +CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) { size_t nnstep = size / stage_size; - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (size_t n = 0; n < stage_size / 4; n += width) { initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); @@ -256,7 +256,7 @@ KFR_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s template <typename T> KFR_SINTRIN void prefetch_one(const complex<T>* in) { -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0); #else __builtin_prefetch(ptr_cast<void>(in)); @@ -266,7 +266,7 @@ KFR_SINTRIN void prefetch_one(const complex<T>* in) template <typename T> KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in) { -#ifdef CID_ARCH_X86 +#ifdef CMT_ARCH_X86 __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0); __builtin_prefetch(ptr_cast<void>(in + stride), 0, _MM_HINT_T0); __builtin_prefetch(ptr_cast<void>(in + stride * 2), 0, _MM_HINT_T0); @@ -288,12 +288,12 @@ KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t constexpr static size_t prefetch_offset = width * 8; const auto N4 = N / csize<4>; const auto N43 = N4 * csize<3>; - __builtin_assume(blocks > 0); - __builtin_assume(N > 0); - __builtin_assume(N4 > 0); - KFR_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) + CMT_ASSUME(blocks > 0); + CMT_ASSUME(N > 0); + CMT_ASSUME(N4 > 0); + CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) { -#pragma clang loop unroll_count(default_unroll_count) +#pragma clang loop unroll_count(2) for (size_t n2 = 0; n2 < N4; n2 += width) { if (prefetch) @@ -315,7 +315,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfal cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) { - __builtin_assume(blocks > 0); + CMT_ASSUME(blocks > 0); constexpr static size_t prefetch_offset = 32 * 4; for (size_t b = 0; b < blocks; b++) { @@ -352,7 +352,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfals cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const 
complex<T>*, const complex<T>*& /*twiddle*/) { - __builtin_assume(blocks > 0); + CMT_ASSUME(blocks > 0); constexpr static size_t prefetch_offset = width * 16; for (size_t b = 0; b < blocks; b += 2) { @@ -377,7 +377,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfal cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) { - __builtin_assume(blocks > 0); + CMT_ASSUME(blocks > 0); constexpr static size_t prefetch_offset = width * 4; #pragma clang loop unroll_count(2) for (size_t b = 0; b < blocks; b += 2) @@ -409,8 +409,8 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfals complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) { constexpr static size_t prefetch_offset = width * 4; - __builtin_assume(blocks > 0); - KFR_LOOP_NOUNROLL + CMT_ASSUME(blocks > 0); + CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b += 4) { if (prefetch) @@ -453,8 +453,8 @@ protected: if (splitin) in = out; const size_t stage_size = this->stage_size; - __builtin_assume(stage_size >= 2048); - __builtin_assume(stage_size % 2048 == 0); + CMT_ASSUME(stage_size >= 2048); + CMT_ASSUME(stage_size % 2048 == 0); radix4_pass(stage_size, 1, csize<width>, ctrue, cbool<splitin>, cbool<!is_even>, cbool<prefetch>, cbool<inverse>, cbool<aligned>, out, in, twiddle); } @@ -836,14 +836,14 @@ struct dft_plan const size_t log2n = ilog2(size); cswitch(csizes<1, 2, 3, 4, 5, 6, 7, 8>, log2n, [&](auto log2n) { - add_stage<internal::fft_specialization_t<T, val_of(log2n), false>::template type>( - size, type); + add_stage<internal::fft_specialization_t<T, val_of(decltype(log2n)()), + false>::template type>(size, type); }, [&]() { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(size, type, is_even, ctrue); - add_stage<internal::fft_reorder_stage_impl_t<T, val_of(is_even)>::template type>( - size, type); + add_stage<internal::fft_reorder_stage_impl_t< + T, val_of(decltype(is_even)())>::template type>(size, type); }); }); initialize(type); diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp @@ -41,18 +41,18 @@ namespace internal { template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, N> y) { return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, 2> y) +CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, 2> y) { vec<T, N> yy = resize<N>(y); return cmul_impl(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y) { vec<T, N> xx = resize<N>(x); return cmul_impl(xx, y); @@ -60,24 +60,24 @@ KFR_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y) /// Complex Multiplication template <typename T, size_t N1, size_t N2> -KFR_INLINE vec<T, std::max(N1, N2)> cmul(vec<T, N1> x, vec<T, N2> y) +CMT_INLINE vec<T, const_max(N1, N2)> cmul(vec<T, N1> x, vec<T, N2> y) { return internal::cmul_impl(x, y); } KFR_FN(cmul) template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, N> y) { return swap<2>(subadd(swap<2>(x) * cdupreal(y), x * cdupimag(y))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> 
cmul_2conj(vec<T, N> in0, vec<T, N> in1, vec<T, N> tw) +CMT_INLINE vec<T, N> cmul_2conj(vec<T, N> in0, vec<T, N> in1, vec<T, N> tw) { return (in0 + in1) * cdupreal(tw) + swap<2>(cnegimag(in0 - in1)) * cdupimag(tw); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec<T, 2> in1, vec<T, N> tw) +CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec<T, 2> in1, vec<T, N> tw) { const vec<T, N> twr = cdupreal(tw); const vec<T, N> twi = cdupimag(tw); @@ -89,13 +89,13 @@ KFR_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec< out1 += sumtw - diftw; } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, 2> y) +CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, 2> y) { vec<T, N> yy = resize<N>(y); return cmul_conj(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y) { vec<T, N> xx = resize<N>(x); return cmul_conj(xx, y); @@ -103,67 +103,70 @@ KFR_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y) KFR_FN(cmul_conj) KFR_FN(cmul_2conj) +template <typename T, size_t N> +using cvec = vec<T, N * 2>; + template <size_t N, bool A = false, typename T> -KFR_INLINE cvec<T, N> cread(const complex<T>* src) +CMT_INLINE cvec<T, N> cread(const complex<T>* src) { return internal_read_write::read<N * 2, A>(ptr_cast<T>(src)); } template <size_t N, bool A = false, typename T> -KFR_INLINE void cwrite(complex<T>* dest, cvec<T, N> value) +CMT_INLINE void cwrite(complex<T>* dest, cvec<T, N> value) { return internal_read_write::write<A>(ptr_cast<T>(dest), value); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -KFR_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) +CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -KFR_INLINE void cwrite_group_impl(complex<T>* dest, cvec<T, count * N> value, csizes_t<indices...>) +CMT_INLINE void cwrite_group_impl(complex<T>* dest, cvec<T, count * N> value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... }; } template <size_t count, size_t N, bool A, typename T, size_t... indices> -KFR_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) +CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, bool A, typename T, size_t... indices> -KFR_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count * N> value, +CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count * N> value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... 
}; } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -KFR_INLINE cvec<T, count * N> cread_group(const complex<T>* src) +CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src) { return cread_group_impl<count, N, stride, A>(src, csizeseq<count>); } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -KFR_INLINE void cwrite_group(complex<T>* dest, cvec<T, count * N> value) +CMT_INLINE void cwrite_group(complex<T>* dest, cvec<T, count * N> value) { return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq<count>); } template <size_t count, size_t N, bool A = false, typename T> -KFR_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) +CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) { return cread_group_impl<count, N, A>(src, stride, csizeseq<count>); } template <size_t count, size_t N, bool A = false, typename T> -KFR_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N> value) +CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N> value) { return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq<count>); } template <size_t N, bool A = false, bool split = false, typename T> -KFR_INLINE cvec<T, N> cread_split(const complex<T>* src) +CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) { cvec<T, N> temp = internal_read_write::read<N * 2, A>(ptr_cast<T>(src)); if (split) @@ -172,7 +175,7 @@ KFR_INLINE cvec<T, N> cread_split(const complex<T>* src) } template <size_t N, bool A = false, bool split = false, typename T> -KFR_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value) +CMT_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value) { if (split) value = interleavehalfs(value); @@ -250,13 +253,13 @@ inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, cvec<f64, 4> x) } template <size_t N, size_t stride, typename T, size_t... Indices> -KFR_INLINE cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) +CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); } template <size_t N, size_t stride, typename T> -KFR_INLINE cvec<T, N> cgather(const complex<T>* base) +CMT_INLINE cvec<T, N> cgather(const complex<T>* base) { if (stride == 1) { @@ -266,7 +269,7 @@ KFR_INLINE cvec<T, N> cgather(const complex<T>* base) return cgather_helper<N, stride, T>(base, csizeseq<N>); } -KFR_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) +CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) { size_t temp = index; index += stride; @@ -274,7 +277,7 @@ KFR_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t index -= size; return temp; } -KFR_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) +CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) { size_t temp = index; index += stride; @@ -282,45 +285,45 @@ KFR_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) } template <size_t N, typename T, size_t... 
Indices> -KFR_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, +CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); } template <size_t N, typename T> -KFR_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) +CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) { return cgather_helper<N, T>(base, index, stride, csizeseq<N>); } template <size_t N, typename T> -KFR_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride) +CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride) { size_t index = 0; return cgather_helper<N, T>(base, index, stride, csizeseq<N>); } template <size_t N, typename T, size_t... Indices> -KFR_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, +CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); } template <size_t N, typename T> -KFR_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) +CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) { return cgather_helper<N, T>(base, index, stride, size, csizeseq<N>); } template <size_t N, size_t stride, typename T, size_t... Indices> -KFR_INLINE void cscatter_helper(complex<T>* base, cvec<T, N> value, csizes_t<Indices...>) +CMT_INLINE void cscatter_helper(complex<T>* base, cvec<T, N> value, csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; } template <size_t N, size_t stride, typename T> -KFR_INLINE void cscatter(complex<T>* base, cvec<T, N> value) +CMT_INLINE void cscatter(complex<T>* base, cvec<T, N> value) { if (stride == 1) { @@ -333,31 +336,29 @@ KFR_INLINE void cscatter(complex<T>* base, cvec<T, N> value) } template <size_t N, typename T, size_t... Indices> -KFR_INLINE void cscatter_helper(complex<T>* base, size_t stride, cvec<T, N> value, csizes_t<Indices...>) +CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, cvec<T, N> value, csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... 
}; } template <size_t N, typename T> -KFR_INLINE void cscatter(complex<T>* base, size_t stride, cvec<T, N> value) +CMT_INLINE void cscatter(complex<T>* base, size_t stride, cvec<T, N> value) { return cscatter_helper<N, T>(base, stride, value, csizeseq<N>); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, vec<IT, N> offset) +CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, vec<IT, N> offset) { return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq<N>); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INLINE void cscatter(complex<T>* base, vec<IT, N> offset, vec<T, N * 2 * groupsize> value) +CMT_INLINE void cscatter(complex<T>* base, vec<IT, N> offset, vec<T, N * 2 * groupsize> value) { return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq<N>); } -constexpr size_t default_unroll_count = 2; - template <typename T> KFR_INTRIN void transpose4x8(cvec<T, 8> z0, cvec<T, 8> z1, cvec<T, 8> z2, cvec<T, 8> z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, @@ -438,15 +439,15 @@ constexpr KFR_INTRIN T chsign(T x) template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, size_t... indices> -constexpr KFR_INTRIN cvec<T, N> get_fixed_twiddle_helper(std::integer_sequence<size_t, indices...>) +constexpr KFR_INTRIN cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) { return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) : cos_using_table<T>(size, (indices / 2 * step + start)))...); } template <typename T, size_t width, size_t... indices> -constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(std::integer_sequence<size_t, indices...>, - size_t size, size_t start, size_t step) +constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, size_t start, + size_t step) { return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start) : cos_using_table<T>(size, indices / 2 * step + start))...); @@ -455,14 +456,13 @@ constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(std::integer_sequen template <typename T, size_t width, size_t size, size_t start, size_t step, bool inverse = false> constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle() { - return get_fixed_twiddle_helper<T, width, size, start, step, inverse>( - std::make_index_sequence<width * 2>()); + return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq<width * 2>); } template <typename T, size_t width> constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle(size_t size, size_t start, size_t step = 0) { - return get_fixed_twiddle_helper<T, width>(std::make_index_sequence<width * 2>(), start, step, size); + return get_fixed_twiddle_helper<T, width>(csizeseq<width * 2>, start, step, size); } template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> @@ -480,7 +480,7 @@ constexpr cvec<T, N> twiddleimagmask() #pragma clang diagnostic pop template <typename T, size_t N> -KFR_NOINLINE static vec<T, N> cossin_conj(vec<T, N> x) +CMT_NOINLINE static vec<T, N> cossin_conj(vec<T, N> x) { return cconj(cossin(x)); } @@ -1277,9 +1277,8 @@ KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, vec<T, N> x, const complex<T>* twiddl // Non-final template <typename T, size_t width, size_t radix, bool inverse, size_t... 
I> -KFR_INTRIN void butterfly_helper(std::index_sequence<I...>, size_t i, csize_t<width>, csize_t<radix>, - cbool_t<inverse>, complex<T>* out, const complex<T>* in, - const complex<T>* tw, size_t stride) +KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, + complex<T>* out, const complex<T>* in, const complex<T>* tw, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1294,8 +1293,8 @@ KFR_INTRIN void butterfly_helper(std::index_sequence<I...>, size_t i, csize_t<wi // Final template <typename T, size_t width, size_t radix, bool inverse, size_t... I> -KFR_INTRIN void butterfly_helper(std::index_sequence<I...>, size_t i, csize_t<width>, csize_t<radix>, - cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride) +KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, + complex<T>* out, const complex<T>* in, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1310,8 +1309,7 @@ KFR_INTRIN void butterfly_helper(std::index_sequence<I...>, size_t i, csize_t<wi template <size_t width, size_t radix, typename... Args> KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) { - butterfly_helper(std::make_index_sequence<radix>(), i, csize<width>, csize<radix>, - std::forward<Args>(args)...); + butterfly_helper(csizeseq<radix>, i, csize<width>, csize<radix>, std::forward<Args>(args)...); } template <typename... Args> @@ -1321,7 +1319,7 @@ KFR_INTRIN void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) template <size_t width, typename... Args> KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) { - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (; i < count / width * width; i += width) butterfly(i, csize<width>, std::forward<Args>(args)...); butterfly_cycle(i, count, csize<width / 2>, std::forward<Args>(args)...); @@ -1330,7 +1328,7 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&. template <size_t width, typename... Args> KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... 
args) { - __builtin_assume(count > 0); + CMT_ASSUME(count > 0); size_t i = 0; butterfly_cycle(i, count, csize<width>, std::forward<Args>(args)...); } @@ -1345,14 +1343,14 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in const complex<T>* in, Tstride ostride, size_t halfradix, size_t halfradix_sqr, const complex<T>* twiddle, size_t i) { - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (; i < halfradix / width * width; i += width) { const cvec<T, 1> in0 = cread<1>(in); cvec<T, width> sum0 = resize<2 * width>(in0); cvec<T, width> sum1 = sum0; - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (size_t j = 0; j < halfradix; j++) { const cvec<T, 1> ina = cread<1>(in + (1 + j)); @@ -1386,17 +1384,17 @@ template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle, Tstride ostride = Tstride{}) { - __builtin_assume(radix > 0); + CMT_ASSUME(radix > 0); { cvec<T, width> sum = T(); size_t j = 0; - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (; j < radix / width * width; j += width) { sum += cread<width>(in + j); } cvec<T, 1> sums = T(); - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (; j < radix; j++) { sums += cread<1>(in + j); @@ -1405,7 +1403,7 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } const size_t halfradix = radix / 2; const size_t halfradix_sqr = halfradix * halfradix; - __builtin_assume(halfradix > 0); + CMT_ASSUME(halfradix > 0); size_t i = 0; generic_butterfly_cycle(csize<width>, radix, cbool<inverse>, out, in, ostride, halfradix, halfradix_sqr, @@ -1424,10 +1422,10 @@ KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* ou constexpr size_t width = vector_width<T, cpu_t::native>; cswitch(csizes<11>, radix, - [&](auto radix_) KFR_INLINE_LAMBDA { - generic_butterfly_w<width>(val_of(radix_), cbool<inverse>, out, in, twiddle, ostride); + [&](auto radix_) CMT_INLINE_LAMBDA { + generic_butterfly_w<width>(decltype(radix_)(), cbool<inverse>, out, in, twiddle, ostride); }, - [&]() KFR_INLINE_LAMBDA { + [&]() CMT_INLINE_LAMBDA { generic_butterfly_w<width>(radix, cbool<inverse>, out, in, twiddle, ostride); }); } diff --git a/include/kfr/dsp.hpp b/include/kfr/dsp.hpp @@ -0,0 +1,43 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "base.hpp" + +#include "dsp/biquad.hpp" +#include "dsp/biquad_design.hpp" +#include "dsp/dcremove.hpp" +#include "dsp/fir.hpp" +#include "dsp/fir_design.hpp" +#include "dsp/fracdelay.hpp" +#include "dsp/goertzel.hpp" +#include "dsp/impulse.hpp" +#include "dsp/interpolation.hpp" +#include "dsp/mixdown.hpp" +#include "dsp/oscillators.hpp" +#include "dsp/resample.hpp" +#include "dsp/speaker.hpp" +#include "dsp/units.hpp" +#include "dsp/waveshaper.hpp" +#include "dsp/weighting.hpp" +#include "dsp/window.hpp" diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp @@ -44,6 +44,15 @@ enum class biquad_type template <typename T> struct biquad_params { + template <typename U> + constexpr biquad_params(const biquad_params<U>& bq) noexcept : a0(static_cast<T>(bq.a0)), + a1(static_cast<T>(bq.a1)), + a2(static_cast<T>(bq.a2)), + b0(static_cast<T>(bq.b0)), + b1(static_cast<T>(bq.b1)), + b2(static_cast<T>(bq.b2)) + { + } constexpr static bool is_pod = true; static_assert(std::is_floating_point<T>::value, "T must be a floating point type"); @@ -74,99 +83,92 @@ struct biquad_params namespace internal { -template <cpu_t cpu = cpu_t::native> -struct in_biquad +template <typename T, size_t filters, KFR_ARCH_DEP> +struct biquad_block { -private: -public: - template <typename T, size_t filters> - struct biquad_block - { - vec<T, filters> s1; - vec<T, filters> s2; - vec<T, filters> a1; - vec<T, filters> a2; - vec<T, filters> b0; - vec<T, filters> b1; - vec<T, filters> b2; + vec<T, filters> s1; + vec<T, filters> s2; + vec<T, filters> a1; + vec<T, filters> a2; + vec<T, filters> b0; + vec<T, filters> b1; + vec<T, filters> b2; - vec<T, filters> out; - biquad_block() : s1(0), s2(0), a1(0), a2(0), b0(1), b1(0), b2(0), out(0) {} - biquad_block(const biquad_params<T>* bq, size_t count) : s1(0), s2(0), out(0) + vec<T, filters> out; + biquad_block() : s1(0), s2(0), a1(0), a2(0), b0(1), b1(0), b2(0), out(0) {} + biquad_block(const biquad_params<T>* bq, size_t count) : s1(0), s2(0), out(0) + { + count = count > filters ? filters : count; + for (size_t i = 0; i < count; i++) { - count = count > filters ? 
filters : count; - for (size_t i = 0; i < count; i++) - { - a1(i) = bq[i].a1; - a2(i) = bq[i].a2; - b0(i) = bq[i].b0; - b1(i) = bq[i].b1; - b2(i) = bq[i].b2; - } - for (size_t i = count; i < filters; i++) - { - a1(i) = T(0); - a2(i) = T(0); - b0(i) = T(1); - b1(i) = T(0); - b2(i) = T(0); - } + a1(i) = bq[i].a1; + a2(i) = bq[i].a2; + b0(i) = bq[i].b0; + b1(i) = bq[i].b1; + b2(i) = bq[i].b2; } - - template <size_t count> - biquad_block(const biquad_params<T> (&bq)[count]) : biquad_block(bq, count) + for (size_t i = count; i < filters; i++) { - static_assert(count <= filters, "count > filters"); + a1(i) = T(0); + a2(i) = T(0); + b0(i) = T(1); + b1(i) = T(0); + b2(i) = T(0); } - }; + } - template <size_t filters, typename T, typename E1> - struct expression_biquads : public expression<E1> + template <size_t count> + biquad_block(const biquad_params<T> (&bq)[count]) : biquad_block(bq, count) { - using value_type = T; + static_assert(count <= filters, "count > filters"); + } +}; - expression_biquads(const biquad_block<T, filters>& bq, E1&& e1) - : expression<E1>(std::forward<E1>(e1)), bq(bq) - { - } - template <size_t width> - KFR_INTRIN vec<T, width> operator()(cinput_t, size_t index, vec_t<T, width> t) const - { - const vec<T, width> in = this->argument_first(index, t); - vec<T, width> out; +template <size_t filters, typename T, typename E1, KFR_ARCH_DEP> +struct expression_biquads : public expression<E1> +{ + using value_type = T; - KFR_LOOP_UNROLL - for (size_t i = 0; i < width; i++) - { - bq.out = process(bq, insertleft(in[i], bq.out)); - out(i) = bq.out[filters - 1]; - } + expression_biquads(const biquad_block<T, filters>& bq, E1&& e1) + : expression<E1>(std::forward<E1>(e1)), bq(bq) + { + } + template <size_t width> + KFR_INTRIN vec<T, width> operator()(cinput_t, size_t index, vec_t<T, width> t) const + { + const vec<T, width> in = this->argument_first(index, t); + vec<T, width> out; - return out; - } - KFR_SINTRIN vec<T, filters> process(biquad_block<T, filters>& bq, vec<T, filters> in) + CMT_LOOP_UNROLL + for (size_t i = 0; i < width; i++) { - const vec<T, filters> out = bq.b0 * in + bq.s1; - bq.s1 = bq.s2 + bq.b1 * in - bq.a1 * out; - bq.s2 = bq.b2 * in - bq.a2 * out; - return out; + bq.out = process(bq, insertleft(in[i], bq.out)); + out(i) = bq.out[filters - 1]; } - mutable biquad_block<T, filters> bq; - }; + + return out; + } + KFR_SINTRIN vec<T, filters> process(biquad_block<T, filters>& bq, vec<T, filters> in) + { + const vec<T, filters> out = bq.b0 * in + bq.s1; + bq.s1 = bq.s2 + bq.b1 * in - bq.a1 * out; + bq.s2 = bq.b2 * in - bq.a2 * out; + return out; + } + mutable biquad_block<T, filters> bq; }; } template <typename T, typename E1> -KFR_INLINE internal::in_biquad<>::expression_biquads<1, T, internal::arg<E1>> biquad( - const biquad_params<T>& bq, E1&& e1) +CMT_INLINE internal::expression_biquads<1, T, internal::arg<E1>> biquad(const biquad_params<T>& bq, E1&& e1) { const biquad_params<T> bqs[1] = { bq }; - return internal::in_biquad<>::expression_biquads<1, T, internal::arg<E1>>(bqs, std::forward<E1>(e1)); + return internal::expression_biquads<1, T, internal::arg<E1>>(bqs, std::forward<E1>(e1)); } template <size_t filters, typename T, typename E1> -KFR_INLINE internal::in_biquad<>::expression_biquads<filters, T, internal::arg<E1>> biquad( +CMT_INLINE internal::expression_biquads<filters, T, internal::arg<E1>> biquad( const biquad_params<T> (&bq)[filters], E1&& e1) { - return internal::in_biquad<>::expression_biquads<filters, T, internal::arg<E1>>(bq, std::forward<E1>(e1)); + 
return internal::expression_biquads<filters, T, internal::arg<E1>>(bq, std::forward<E1>(e1)); } } diff --git a/include/kfr/dsp/biquad_design.hpp b/include/kfr/dsp/biquad_design.hpp @@ -28,8 +28,8 @@ namespace kfr { -template <typename T> -KFR_INLINE biquad_params<T> biquad_allpass(T frequency, T Q) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) { const T alpha = std::sin(frequency) / 2.0 * Q; const T cs = std::cos(frequency); @@ -43,8 +43,8 @@ KFR_INLINE biquad_params<T> biquad_allpass(T frequency, T Q) return { b0, b1, b2, a0, a1, a2 }; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_lowpass(T frequency, T Q) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -57,8 +57,8 @@ KFR_INLINE biquad_params<T> biquad_lowpass(T frequency, T Q) return { 1.0, b1, b2, a0, a1, a2 }; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_highpass(T frequency, T Q) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -71,8 +71,8 @@ KFR_INLINE biquad_params<T> biquad_highpass(T frequency, T Q) return { 1.0, b1, b2, a0, a1, a2 }; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_bandpass(T frequency, T Q) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -85,8 +85,8 @@ KFR_INLINE biquad_params<T> biquad_bandpass(T frequency, T Q) return { 1.0, b1, b2, a0, a1, a2 }; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_notch(T frequency, T Q) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -99,8 +99,8 @@ KFR_INLINE biquad_params<T> biquad_notch(T frequency, T Q) return { 1.0, b1, b2, a0, a1, a2 }; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_peak(T frequency, T Q, T gain) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -130,8 +130,8 @@ KFR_INLINE biquad_params<T> biquad_peak(T frequency, T Q, T gain) return result; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_lowshelf(T frequency, T gain) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -161,8 +161,8 @@ KFR_INLINE biquad_params<T> biquad_lowshelf(T frequency, T gain) return result; } -template <typename T> -KFR_INLINE biquad_params<T> biquad_highshelf(T frequency, T gain) +template <typename T = fbase> +CMT_INLINE biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); diff --git a/include/kfr/dsp/dcremove.hpp b/include/kfr/dsp/dcremove.hpp @@ -0,0 +1,37 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, 
either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. + */ +#pragma once + +#include "biquad.hpp" +#include "biquad_design.hpp" + +namespace kfr +{ + +template <typename T, typename E1> +CMT_INLINE internal::expression_biquads<1, T, internal::arg<E1>> dcremove(E1&& e1, double cutoff = 0.00025) +{ + const biquad_params<T> bqs[1] = { biquad_highpass(cutoff, 0.5) }; + return internal::expression_biquads<1, T, internal::arg<E1>>(bqs, std::forward<E1>(e1)); +} +} diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp @@ -38,7 +38,7 @@ using fir_taps = univector<T, Size>; namespace internal { -template <size_t tapcount, typename T, typename E1> +template <size_t tapcount, typename T, typename E1, KFR_ARCH_DEP> struct expression_short_fir : expression<E1> { static_assert(is_poweroftwo(tapcount), "tapcount must be a power of two"); @@ -47,8 +47,12 @@ struct expression_short_fir : expression<E1> : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0) { } + expression_short_fir(E1&& e1, const array_ref<const T>& taps) + : expression<E1>(std::forward<E1>(e1)), taps(taps), delayline(0) + { + } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const { vec<T, N> in = cast<T>(this->argument_first(index, x)); @@ -63,7 +67,7 @@ struct expression_short_fir : expression<E1> mutable vec<T, tapcount - 1> delayline; }; -template <typename T, typename E1> +template <typename T, typename E1, KFR_ARCH_DEP> struct expression_fir : expression<E1> { expression_fir(E1&& e1, const array_ref<const T>& taps) @@ -71,14 +75,14 @@ struct expression_fir : expression<E1> { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N> x) const { const size_t tapcount = taps.size(); const vec<T, N> input = cast<T>(this->argument_first(index, x)); vec<T, N> output; size_t cursor = delayline_cursor; - KFR_LOOP_NOUNROLL + CMT_LOOP_NOUNROLL for (size_t i = 0; i < N; i++) { delayline.ringbuf_write(cursor, input[i]); @@ -95,12 +99,12 @@ struct expression_fir : expression<E1> } template <typename T, typename E1, size_t Tag> -KFR_INLINE internal::expression_fir<T, E1> fir(E1&& e1, const univector<T, Tag>& taps) +CMT_INLINE internal::expression_fir<T, E1> fir(E1&& e1, const univector<T, Tag>& taps) { return internal::expression_fir<T, E1>(std::forward<E1>(e1), taps.ref()); } template <typename T, size_t TapCount, typename E1> -KFR_INLINE internal::expression_short_fir<TapCount, T, E1> short_fir(E1&& e1, +CMT_INLINE internal::expression_short_fir<TapCount, T, E1> short_fir(E1&& e1, const univector<T, TapCount>& taps) { static_assert(TapCount >= 1 && TapCount < 16, "Use short_fir only for small FIR filters"); diff --git 
a/include/kfr/dsp/fir_design.hpp b/include/kfr/dsp/fir_design.hpp @@ -120,25 +120,25 @@ KFR_I_FN(fir_bandpass) KFR_I_FN(fir_bandstop) template <typename T, size_t Tag> -KFR_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, +CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, bool normalize = true) { return intrinsics::fir_lowpass(taps.slice(), cutoff, window, normalize); } template <typename T, size_t Tag> -KFR_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, +CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, bool normalize = true) { return intrinsics::fir_highpass(taps.slice(), cutoff, window, normalize); } template <typename T, size_t Tag> -KFR_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, +CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, const expression_pointer<T>& window, bool normalize = true) { return intrinsics::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize); } template <typename T, size_t Tag> -KFR_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, +CMT_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, const expression_pointer<T>& window, bool normalize = true) { return intrinsics::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize); diff --git a/include/kfr/dsp/fracdelay.hpp b/include/kfr/dsp/fracdelay.hpp @@ -28,12 +28,10 @@ namespace kfr { template <typename T, typename E1> -KFR_INLINE internal::expression_short_fir<2, T, E1> fracdelay(E1&& e1, T delay) +CMT_INLINE internal::expression_short_fir<2, T, E1> fracdelay(E1&& e1, T delay) { if (delay < 0) delay = 0; - if (delay > 1) - delay = fract(delay); univector<T, 2> taps({ 1 - delay, delay }); return internal::expression_short_fir<2, T, E1>(std::forward<E1>(e1), taps.ref()); } diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp @@ -32,95 +32,86 @@ namespace kfr namespace internal { -template <cpu_t c = cpu_t::native, cpu_t cc = c> -struct in_goertzel : in_sin_cos<cc> +template <typename T, KFR_ARCH_DEP> +struct expression_goertzel : output_expression { -private: - using in_sin_cos<cc>::sin; - using in_sin_cos<cc>::cos; - -public: - template <typename T> - struct expression_goertzel : output_expression + expression_goertzel(complex<T>& result, T omega) + : result(result), omega(omega), coeff(2 * cos(omega)), q0(), q1(), q2() { - expression_goertzel(complex<T>& result, identity<T> omega) - : result(result), omega(omega), coeff(2 * cos(omega)), q0(), q1(), q2() - { - } - ~expression_goertzel() - { - result.real(q1 - q2 * cos(omega)); - result.imag(q2 * sin(omega)); - } - template <typename U, size_t N> - KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) + } + ~expression_goertzel() + { + result.real(q1 - q2 * cos(omega)); + result.imag(q2 * sin(omega)); + } + template <typename U, size_t N> + CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) + { + vec<T, N> in = x; + CMT_LOOP_UNROLL + for (size_t i = 0; i < N; i++) { - vec<T, N> in = cast<T>(x); - KFR_LOOP_UNROLL - for (size_t i = 0; i < N; i++) - { - q0 = coeff * q1 - q2 + in[i]; - q2 = q1; - q1 = q0; - } + q0 = coeff * q1 - q2 + in[i]; + q2 = q1; + 
q1 = q0; } - complex<T>& result; - const T omega; - const T coeff; - T q0; - T q1; - T q2; - }; + } + complex<T>& result; + const T omega; + const T coeff; + T q0; + T q1; + T q2; +}; - template <typename T, size_t width> - struct expression_parallel_goertzel : output_expression +template <typename T, size_t width> +struct expression_parallel_goertzel : output_expression +{ + expression_parallel_goertzel(complex<T> result[], vec<T, width> omega) + : result(result), omega(omega), coeff(cos(omega)), q0(), q1(), q2() { - expression_parallel_goertzel(complex<T> result[], vec<T, width> omega) - : result(result), omega(omega), coeff(cos(omega)), q0(), q1(), q2() - { - } - ~expression_parallel_goertzel() - { - const vec<T, width> re = q1 - q2 * cos(omega); - const vec<T, width> im = q2 * sin(omega); - for (size_t i = 0; i < width; i++) - { - result[i].real(re[i]); - result[i].imag(im[i]); - } - } - template <typename U, size_t N> - KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) + } + ~expression_parallel_goertzel() + { + const vec<T, width> re = q1 - q2 * cos(omega); + const vec<T, width> im = q2 * sin(omega); + for (size_t i = 0; i < width; i++) { - const vec<T, N> in = cast<T>(x); - KFR_LOOP_UNROLL - for (size_t i = 0; i < N; i++) - { - q0 = coeff * q1 - q2 + in[i]; - q2 = q1; - q1 = q0; - } + result[i].real(re[i]); + result[i].imag(im[i]); } - complex<T> result[]; - const vec<T, width> omega; - const vec<T, width> coeff; - vec<T, width> q0; - vec<T, width> q1; - vec<T, width> q2; - }; - - template <typename T> - KFR_SINTRIN expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) - { - return expression_goertzel<T>(result, omega); } - - template <typename T, size_t width> - KFR_SINTRIN expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], - const T (&omega)[width]) + template <typename U, size_t N> + CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) { - return expression_parallel_goertzel<T, width>(result, read<width>(omega)); + const vec<T, N> in = x; + CMT_LOOP_UNROLL + for (size_t i = 0; i < N; i++) + { + q0 = coeff * q1 - q2 + in[i]; + q2 = q1; + q1 = q0; + } } + complex<T> result[]; + const vec<T, width> omega; + const vec<T, width> coeff; + vec<T, width> q0; + vec<T, width> q1; + vec<T, width> q2; +}; }; + +template <typename T> +KFR_SINTRIN internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) +{ + return internal::expression_goertzel<T>(result, omega); +} + +template <typename T, size_t width> +KFR_SINTRIN internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], + const T (&omega)[width]) +{ + return internal::expression_parallel_goertzel<T, width>(result, read<width>(omega)); } } diff --git a/include/kfr/dsp/interpolation.hpp b/include/kfr/dsp/interpolation.hpp @@ -32,7 +32,7 @@ namespace kfr template <typename T, typename M> KFR_SINTRIN T nearest(M mu, T x1, T x2) { - return native::select(mu < M(0.5), x1, x2); + return select(mu < M(0.5), x1, x2); } template <typename T, typename M> @@ -44,7 +44,7 @@ KFR_SINTRIN T linear(M mu, T x1, T x2) template <typename T, typename M> KFR_SINTRIN T cosine(M mu, T x1, T x2) { - return mix((M(1) - native::fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); + return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); } template <typename T, typename M> diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp @@ -0,0 +1,62 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is 
part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. + */ +#pragma once + +#include "../base.hpp" + +namespace kfr +{ + +template <typename... E> +internal::expression_function<fn_add, E...> mixdown(E&&... e) +{ + return internal::expression_function<fn_add, E...>(fn_add(), std::forward<E>(e)...); +} + +namespace internal +{ +struct stereo_matrix +{ + template <typename T, size_t N> + CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const + { + return process(x, csizeseq<N>); + } + template <typename T, size_t N, size_t... indices> + CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const + { + return vec<vec<T, 2>, N>(hadd(transpose(x[indices] * matrix))...); + } + const f64x2x2 matrix; +}; +} + +template <typename Left, typename Right, + typename Result = internal::expression_function< + internal::stereo_matrix, internal::expression_pack<internal::arg<Left>, internal::arg<Right>>>> +Result mixdown_stereo(Left&& left, Right&& right, const f64x2x2& matrix) +{ + return Result(internal::stereo_matrix{ matrix }, + pack(std::forward<Left>(left), std::forward<Right>(right))); +} +} diff --git a/include/kfr/dsp/resample.hpp b/include/kfr/dsp/resample.hpp @@ -22,190 +22,4 @@ */ #pragma once -#include "../base/function.hpp" -#include "../base/memory.hpp" -#include "../base/reduce.hpp" -#include "../base/vec.hpp" -#include "window.hpp" - -namespace kfr -{ -namespace resample_quality -{ -constexpr csize_t<4> draft{}; -constexpr csize_t<6> low{}; -constexpr csize_t<8> normal{}; -constexpr csize_t<10> high{}; -} - -namespace internal -{ -template <typename T1, typename T2> -KFR_SINTRIN T1 resample_blackman(T1 n, T2 a) -{ - const T1 a0 = (1 - a) * 0.5; - const T1 a1 = 0.5; - const T1 a2 = a * 0.5; - n = n * c_pi<T1, 2>; - return a0 - a1 * cos(n) + a2 * cos(2 * n); -} - -template <typename T, size_t quality> -struct resampler -{ - using itype = i64; - - constexpr static itype depth = static_cast<itype>(1 << (quality + 1)); - - resampler(itype interpolation_factor, itype decimation_factor, T scale = T(1), T cutoff = 0.49) - : input_position(0), output_position(0) - { - const i64 gcf = gcd(interpolation_factor, decimation_factor); - interpolation_factor /= gcf; - decimation_factor /= gcf; - - taps = depth * interpolation_factor; - order = size_t(depth * interpolation_factor - 1); - - this->interpolation_factor = interpolation_factor; - this->decimation_factor = decimation_factor; - - const itype halftaps = taps / 2; - filter = univector<T>(size_t(taps), T()); - delay = univector<T>(size_t(depth), T()); - - cutoff = cutoff / std::max(decimation_factor, interpolation_factor); - - for (itype j = 0, jj = 0; j < taps; j++) - 
{ - filter[size_t(j)] = scale * 2 * interpolation_factor * cutoff * - sinc((jj - halftaps) * cutoff * c_pi<T, 2>) * - resample_blackman(T(jj) / T(taps - 1), T(0.16)); - jj += size_t(interpolation_factor); - if (jj >= taps) - jj = jj - taps + 1; - } - - const T s = reciprocal(sum(filter)) * interpolation_factor; - filter = filter * s; - } - KFR_INLINE size_t operator()(T* dest, size_t zerosize) - { - size_t outputsize = 0; - const itype srcsize = itype(zerosize); - - for (size_t i = 0;; i++) - { - const itype ii = itype(i) + output_position; - const itype workindex = ii * (decimation_factor); - const itype workindex_rem = workindex % (interpolation_factor); - const itype start = workindex_rem ? (interpolation_factor)-workindex_rem : 0; - itype srcindex = workindex / (interpolation_factor); - srcindex = workindex_rem ? srcindex + 1 : srcindex; - const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth)); - srcindex = srcindex - (depth - 1); - - if (srcindex + depth >= input_position + srcsize) - break; - outputsize++; - - if (dest) - { - if (srcindex >= input_position) - { - dest[i] = T(0); - } - else - { - const itype prev_count = input_position - srcindex; - dest[i] = dotproduct(delay.slice(size_t(depth - prev_count)), tap_ptr); - } - } - } - if (srcsize >= depth) - { - delay = zeros(); - } - else - { - delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize)); - delay.slice(size_t(depth - srcsize)) = zeros(); - } - - input_position += srcsize; - output_position += outputsize; - return outputsize; - } - KFR_INLINE size_t operator()(T* dest, univector_ref<const T> src) - { - size_t outputsize = 0; - const itype srcsize = itype(src.size()); - - for (size_t i = 0;; i++) - { - const itype ii = itype(i) + output_position; - const itype workindex = ii * (decimation_factor); - const itype workindex_rem = workindex % (interpolation_factor); - const itype start = workindex_rem ? (interpolation_factor)-workindex_rem : 0; - itype srcindex = workindex / (interpolation_factor); - srcindex = workindex_rem ? 
srcindex + 1 : srcindex; - const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth)); - srcindex = srcindex - (depth - 1); - - if (srcindex + depth >= input_position + srcsize) - break; - outputsize++; - - if (dest) - { - if (srcindex >= input_position) - { - dest[i] = dotproduct(src.slice(size_t(srcindex - input_position), size_t(depth)), - tap_ptr /*, depth*/); - } - else - { - const itype prev_count = input_position - srcindex; - dest[i] = - dotproduct(delay.slice(size_t(depth - prev_count)), - tap_ptr /*, size_t(prev_count)*/) + - dotproduct( - src, tap_ptr.slice(size_t(prev_count), - size_t(depth - prev_count)) /*, size_t(depth - prev_count)*/); - } - } - } - if (srcsize >= depth) - { - delay = src.slice(size_t(srcsize - depth)); - } - else - { - delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize)); - delay.slice(size_t(depth - srcsize)) = src; - } - - input_position += srcsize; - output_position += outputsize; - return outputsize; - } - itype taps; - size_t order; - itype interpolation_factor; - itype decimation_factor; - univector<T> filter; - univector<T> delay; - itype input_position; - itype output_position; -}; -} - -template <typename T, size_t quality> -inline internal::resampler<T, quality> resampler(csize_t<quality>, size_t interpolation_factor, - size_t decimation_factor, T scale = T(1), T cutoff = 0.49) -{ - using itype = typename internal::resampler<T, quality>::itype; - return internal::resampler<T, quality>(itype(interpolation_factor), itype(decimation_factor), scale, - cutoff); -} -} +#include "sample_rate_conversion.hpp" diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp @@ -0,0 +1,227 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../base/function.hpp" +#include "../base/memory.hpp" +#include "../base/reduce.hpp" +#include "../base/vec.hpp" +#include "window.hpp" + +namespace kfr +{ +namespace sample_rate_conversion_quality +{ +constexpr csize_t<4> draft{}; +constexpr csize_t<6> low{}; +constexpr csize_t<8> normal{}; +constexpr csize_t<10> high{}; +} + +namespace resample_quality = sample_rate_conversion_quality; + +namespace internal +{ +template <typename T1, typename T2> +KFR_SINTRIN T1 sample_rate_converter_blackman(T1 n, T2 a) +{ + const T1 a0 = (1 - a) * 0.5; + const T1 a1 = 0.5; + const T1 a2 = a * 0.5; + n = n * c_pi<T1, 2>; + return a0 - a1 * cos(n) + a2 * cos(2 * n); +} + +template <typename T, size_t quality, KFR_ARCH_DEP> +struct sample_rate_converter +{ + using itype = i64; + + constexpr static itype depth = static_cast<itype>(1 << (quality + 1)); + + sample_rate_converter(itype interpolation_factor, itype decimation_factor, T scale = T(1), + T cutoff = 0.49) + : input_position(0), output_position(0) + { + const i64 gcf = gcd(interpolation_factor, decimation_factor); + interpolation_factor /= gcf; + decimation_factor /= gcf; + + taps = depth * interpolation_factor; + order = size_t(depth * interpolation_factor - 1); + + this->interpolation_factor = interpolation_factor; + this->decimation_factor = decimation_factor; + + const itype halftaps = taps / 2; + filter = univector<T>(size_t(taps), T()); + delay = univector<T>(size_t(depth), T()); + + cutoff = cutoff / std::max(decimation_factor, interpolation_factor); + + for (itype j = 0, jj = 0; j < taps; j++) + { + filter[size_t(j)] = scale * 2 * interpolation_factor * cutoff * + sinc((jj - halftaps) * cutoff * c_pi<T, 2>) * + sample_rate_converter_blackman(T(jj) / T(taps - 1), T(0.16)); + jj += size_t(interpolation_factor); + if (jj >= taps) + jj = jj - taps + 1; + } + + const T s = reciprocal(sum(filter)) * interpolation_factor; + filter = filter * s; + } + CMT_INLINE size_t operator()(T* dest, size_t zerosize) + { + size_t outputsize = 0; + const itype srcsize = itype(zerosize); + + for (size_t i = 0;; i++) + { + const itype ii = itype(i) + output_position; + const itype workindex = ii * (decimation_factor); + const itype workindex_rem = workindex % (interpolation_factor); + const itype start = workindex_rem ? (interpolation_factor)-workindex_rem : 0; + itype srcindex = workindex / (interpolation_factor); + srcindex = workindex_rem ? srcindex + 1 : srcindex; + const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth)); + srcindex = srcindex - (depth - 1); + + if (srcindex + depth >= input_position + srcsize) + break; + outputsize++; + + if (dest) + { + if (srcindex >= input_position) + { + dest[i] = T(0); + } + else + { + const itype prev_count = input_position - srcindex; + dest[i] = dotproduct(delay.slice(size_t(depth - prev_count)), tap_ptr); + } + } + } + if (srcsize >= depth) + { + delay = zeros(); + } + else + { + delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize)); + delay.slice(size_t(depth - srcsize)) = zeros(); + } + + input_position += srcsize; + output_position += outputsize; + return outputsize; + } + CMT_INLINE size_t operator()(T* dest, univector_ref<const T> src) + { + size_t outputsize = 0; + const itype srcsize = itype(src.size()); + + for (size_t i = 0;; i++) + { + const itype ii = itype(i) + output_position; + const itype workindex = ii * (decimation_factor); + const itype workindex_rem = workindex % (interpolation_factor); + const itype start = workindex_rem ? 
(interpolation_factor)-workindex_rem : 0; + itype srcindex = workindex / (interpolation_factor); + srcindex = workindex_rem ? srcindex + 1 : srcindex; + const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(start * depth)); + srcindex = srcindex - (depth - 1); + + if (srcindex + depth >= input_position + srcsize) + break; + outputsize++; + + if (dest) + { + if (srcindex >= input_position) + { + dest[i] = dotproduct(src.slice(size_t(srcindex - input_position), size_t(depth)), + tap_ptr /*, depth*/); + } + else + { + const itype prev_count = input_position - srcindex; + dest[i] = + dotproduct(delay.slice(size_t(depth - prev_count)), + tap_ptr /*, size_t(prev_count)*/) + + dotproduct( + src, tap_ptr.slice(size_t(prev_count), + size_t(depth - prev_count)) /*, size_t(depth - prev_count)*/); + } + } + } + if (srcsize >= depth) + { + delay = src.slice(size_t(srcsize - depth)); + } + else + { + delay.slice(0, size_t(depth - srcsize)) = delay.slice(size_t(srcsize)); + delay.slice(size_t(depth - srcsize)) = src; + } + + input_position += srcsize; + output_position += outputsize; + return outputsize; + } + itype taps; + size_t order; + itype interpolation_factor; + itype decimation_factor; + univector<T> filter; + univector<T> delay; + itype input_position; + itype output_position; +}; +} + +template <typename T, size_t quality> +inline internal::sample_rate_converter<T, quality> sample_rate_converter(csize_t<quality>, + size_t interpolation_factor, + size_t decimation_factor, + T scale = T(1), T cutoff = 0.49) +{ + using itype = typename internal::sample_rate_converter<T, quality>::itype; + return internal::sample_rate_converter<T, quality>(itype(interpolation_factor), itype(decimation_factor), + scale, cutoff); +} + +// Deprecated in 0.9.2 +template <typename T, size_t quality> +inline internal::sample_rate_converter<T, quality> resampler(csize_t<quality>, size_t interpolation_factor, + size_t decimation_factor, T scale = T(1), + T cutoff = 0.49) +{ + using itype = typename internal::sample_rate_converter<T, quality>::itype; + return internal::sample_rate_converter<T, quality>(itype(interpolation_factor), itype(decimation_factor), + scale, cutoff); +} +} diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp @@ -41,7 +41,7 @@ namespace intrinsics template <typename T, typename TF = ftype<T>> KFR_SINTRIN TF amp_to_dB(T amp) { - return log(cast<subtype<TF>>(amp)) * subtype<TF>(8.6858896380650365530225783783322); + return log(static_cast<TF>(amp)) * subtype<TF>(8.6858896380650365530225783783322); // return T( 20.0 ) * log10( level ); } diff --git a/include/kfr/dsp/waveshaper.hpp b/include/kfr/dsp/waveshaper.hpp @@ -22,21 +22,20 @@ */ #pragma once -#include "../base/abs.hpp" +#include "../base/clamp.hpp" #include "../base/hyperbolic.hpp" -#include "../base/min_max.hpp" namespace kfr { template <typename E1> inline auto waveshaper_hardclip(E1&& input, double clip_level) { - return native::clamp(input, -clip_level, +clip_level); + return clamp(input, -clip_level, +clip_level); } template <typename E1> inline auto waveshaper_tanh(E1&& input, double saturation) { - return native::tanh(saturation * input) * (native::coth(saturation)); + return tanh(saturation * input) * (coth(saturation)); } } diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp @@ -122,7 +122,7 @@ struct expression_rectangular : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, 
size_t index, vec_t<U, N>) const { using UI = utype<U>; const vec<UI, N> i = enumerate(vec<UI, N>()) + cast<UI>(index); @@ -144,10 +144,10 @@ struct expression_triangular : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(1 - abs(linspace(cinput, index, y))); + return 1 - abs(linspace(cinput, index, y)); } size_t size() const { return m_size; } @@ -166,10 +166,10 @@ struct expression_bartlett : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(1 - abs(linspace(cinput, index, y))); + return 1 - abs(linspace(cinput, index, y)); } size_t size() const { return m_size; } @@ -188,10 +188,10 @@ struct expression_cosine : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(sin(c_pi<T> * linspace(cinput, index, y))); + return sin(c_pi<T> * linspace(cinput, index, y)); } size_t size() const { return m_size; } @@ -210,10 +210,10 @@ struct expression_hann : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y)))); + return T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))); } size_t size() const { return m_size; } @@ -232,11 +232,11 @@ struct expression_bartlett_hann : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; const vec<T, N> xx = linspace(cinput, index, y); - return cast<U>(T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5)))); + return T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))); } size_t size() const { return m_size; } @@ -255,10 +255,10 @@ struct expression_hamming : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y)))); + return alpha - (1.0 - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))); } size_t size() const { return m_size; } @@ -278,11 +278,11 @@ struct expression_bohman : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; const vec<U, N> n = abs(linspace(cinput, index, y)); - return cast<U>((T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n)); + return (T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n); } size_t size() const { return m_size; } @@ -301,11 +301,11 @@ struct 
expression_blackman : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; const vec<T, N> n = linspace(cinput, index, y); - return cast<U>(a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n)); + return a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n); } size_t size() const { return m_size; } @@ -325,12 +325,12 @@ struct expression_blackman_harris : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; - return cast<U>(T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n)); + return T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n); } size_t size() const { return m_size; } @@ -350,10 +350,10 @@ struct expression_kaiser : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m); + return modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m; } size_t size() const { return m_size; } @@ -374,7 +374,7 @@ struct expression_flattop : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; @@ -383,7 +383,7 @@ struct expression_flattop : input_expression constexpr T a2 = 1.29; constexpr T a3 = 0.388; constexpr T a4 = 0.028; - return cast<U>(a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n)); + return a0 - a1 * cos(n) + a2 * cos(2 * n) - a3 * cos(3 * n) + a4 * cos(4 * n); } size_t size() const { return m_size; } @@ -402,10 +402,10 @@ struct expression_gaussian : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(exp(-0.5 * sqr(alpha * linspace(cinput, index, y)))); + return exp(-0.5 * sqr(alpha * linspace(cinput, index, y))); } size_t size() const { return m_size; } @@ -425,10 +425,10 @@ struct expression_lanczos : input_expression { } template <typename U, size_t N> - KFR_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const { constexpr vec_t<T, N> y{}; - return cast<U>(sinc(linspace(cinput, index, y))); + return sinc(linspace(cinput, index, y)); } size_t size() const { return m_size; } @@ -462,87 +462,88 @@ KFR_WINDOW_BY_TYPE(kaiser) KFR_WINDOW_BY_TYPE(flattop) KFR_WINDOW_BY_TYPE(gaussian) KFR_WINDOW_BY_TYPE(lanczos) +#undef KFR_WINDOW_BY_TYPE } -KFR_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size) +CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size) { return internal::expression_rectangular<fbase>(size, fbase()); } 
template <typename T = fbase> -KFR_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_triangular<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_cosine<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_hann<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett_hann<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_hamming<T> window_hamming(size_t size, T alpha = 0.54, +CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54, ctype_t<T> = ctype_t<T>()) { return internal::expression_hamming<T>(size, alpha); } template <typename T = fbase> -KFR_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bohman<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_blackman<T> window_blackman( - size_t size, T alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric, +CMT_INLINE internal::expression_blackman<T> window_blackman( + size_t size, identity<T> alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { return internal::expression_blackman<T>(size, alpha, symmetry); } template <typename T = fbase> -KFR_INLINE internal::expression_blackman_harris<T> window_blackman_harris( +CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris( size_t size, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { return internal::expression_blackman_harris<T>(size, T(), symmetry); } template <typename T = fbase> -KFR_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, T beta = T(0.5), +CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5), ctype_t<T> = ctype_t<T>()) { return internal::expression_kaiser<T>(size, beta); } template <typename T = fbase> -KFR_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_flattop<T>(size); } template <typename T = fbase> -KFR_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, T alpha = 2.5, +CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> 
alpha = 2.5, ctype_t<T> = ctype_t<T>()) { return internal::expression_gaussian<T>(size, alpha); } template <typename T = fbase> -KFR_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) +CMT_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_lanczos<T>(size); } template <typename T = fbase, window_type type, typename window_expr = typename internal::window_by_type<type>::template type<T>> -KFR_NOINLINE window_expr window(size_t size, cval_t<window_type, type>, T win_param = T(), +CMT_NOINLINE window_expr window(size_t size, cval_t<window_type, type>, identity<T> win_param = T(), window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { @@ -550,7 +551,7 @@ KFR_NOINLINE window_expr window(size_t size, cval_t<window_type, type>, T win_pa } template <typename T = fbase> -KFR_NOINLINE expression_pointer<T> window(size_t size, window_type type, T win_param, +CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identity<T> win_param, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { @@ -561,7 +562,7 @@ KFR_NOINLINE expression_pointer<T> window(size_t size, window_type type, T win_p window_type::flattop, window_type::gaussian, window_type::lanczos>, type, [=](auto win) { - constexpr window_type window = val_of(win); + constexpr window_type window = val_of(decltype(win)()); return to_pointer<T>( typename internal::window_by_type<window>::template type<T>(size, win_param, symmetry)); }, diff --git a/include/kfr/io.hpp b/include/kfr/io.hpp @@ -0,0 +1,30 @@ +/** + * Copyright (C) 2016 D Levin (http://www.kfrlib.com) + * This file is part of KFR + * + * KFR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * KFR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with KFR. + * + * If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + * Buying a commercial license is mandatory as soon as you develop commercial activities without + * disclosing the source code of your own applications. + * See http://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "base.hpp" + +#include "io/audiofile.hpp" +#include "io/file.hpp" +#include "io/python_plot.hpp" +#include "io/tostring.hpp" diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp @@ -85,7 +85,7 @@ struct expression_file_writer : expression_file_base, output_expression { if (position != index) fseeko(file, static_cast<off_t>(index * sizeof(T)), SEEK_SET); - const vec<T, N> output = cast<T>(value); + const vec<T, N> output = value; fwrite(output.data(), sizeof(T), output.size(), file); position = index + N; } @@ -104,7 +104,7 @@ struct expression_file_reader : expression_file_base, input_expression vec<T, N> input = qnan; fread(input.data(), sizeof(T), input.size(), file); position = index + N; - return cast<U>(input); + return input; } mutable size_t position = 0; }; diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp @@ -25,7 +25,7 @@ #include "../cometa/string.hpp" #include <cstdlib> -#ifdef KFR_OS_WIN +#ifdef CMT_OS_WIN #include <direct.h> #define cross_getcwd _getcwd #else @@ -37,6 +37,10 @@ namespace kfr { namespace internal { +#pragma clang diagnostic push +#if CMT_HAS_WARNING("-Wdeprecated-declarations") +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif void python(const std::string& name, const std::string& code) { @@ -46,7 +50,7 @@ void python(const std::string& name, const std::string& code) cross_getcwd(curdir, arraysize(curdir)); filename = curdir; } -#ifdef KFR_OS_WIN +#ifdef CMT_OS_WIN const char* slash = "\\"; #else const char* slash = "/"; @@ -58,6 +62,7 @@ void python(const std::string& name, const std::string& code) fclose(f); std::system(("python \"" + filename + "\"").c_str()); } +#pragma clang diagnostic pop } static std::string concat_args() { return {}; } diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp @@ -29,6 +29,8 @@ namespace cometa { +inline std::string repr(kfr::cpu_t v); + template <typename T> inline std::string repr(const kfr::complex<T>& v); @@ -91,6 +93,8 @@ inline std::string repr(const kfr::complex<T>& v) return as_string(v.real()) + " + " + as_string(v.imag()) + "j"; } +inline std::string repr(kfr::cpu_t v) { return kfr::cpu_name(v); } + template <typename T> inline std::string repr(const T* source, size_t N) { @@ -99,7 +103,7 @@ inline std::string repr(const T* source, size_t N) { if (i > 0) { - if (i % details::number_columns == 0) + if (i % details::number_columns == 0 || kfr::is_vec<T>::value) str += "\n"; else str += " "; diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp @@ -22,25 +22,4 @@ */ #pragma once -#include "base/vec.hpp" - -#include "base/abs.hpp" -#include "base/asin_acos.hpp" -#include "base/atan.hpp" -#include "base/complex.hpp" -#include "base/constants.hpp" -#include "base/digitreverse.hpp" -#include "base/gamma.hpp" -#include "base/hyperbolic.hpp" -#include "base/log_exp.hpp" -#include "base/logical.hpp" -#include "base/min_max.hpp" -#include "base/operators.hpp" -#include "base/read_write.hpp" -#include "base/round.hpp" -#include "base/saturation.hpp" -#include "base/select.hpp" -#include "base/shuffle.hpp" -#include "base/sin_cos.hpp" -#include "base/sqrt.hpp" -#include "base/tan.hpp" +#include "base.hpp" diff --git a/include/kfr/version.hpp b/include/kfr/version.hpp @@ -29,7 +29,7 @@ namespace kfr { static std::string library_version() { - return "KFR " + std::string(version_string) + " " + CID_STRINGIFY(KFR_ARCH_NAME) + + return "KFR " + std::string(version_string) + " " + CMT_STRINGIFY(CMT_ARCH_NAME) + 
bitness_const(" 32-bit", " 64-bit"); } } diff --git a/sources.cmake b/sources.cmake @@ -1,97 +1,88 @@ -# Copyright (C) 2016 D Levin (http://www.kfrlib.com) -# This file is part of KFR -# -# KFR is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# KFR is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with KFR. +# Auto-generated file. Do not edit +# Use update-sources.py set( KFR_SRC ${PROJECT_SOURCE_DIR}/include/kfr/all.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/cometa.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/io.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/version.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cident.h ${PROJECT_SOURCE_DIR}/include/kfr/base/abs.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/asin_acos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/clamp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/compiletime.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/complex.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/constants.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/cpuid.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/cpuid_auto.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/digitreverse.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/function.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/gamma.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/log_exp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/hyperbolic.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/logical.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/log_exp.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/min_max.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/modzerobessel.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/read_write.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/round.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/saturation.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/select.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/shuffle.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/sin_cos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/specializations.i - ${PROJECT_SOURCE_DIR}/include/kfr/base/hyperbolic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/sqrt.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/tan.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/types.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/vec.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/intrinsics.h + ${PROJECT_SOURCE_DIR}/include/kfr/base/kfr.h + 
${PROJECT_SOURCE_DIR}/include/kfr/base/specializations.i + ${PROJECT_SOURCE_DIR}/include/kfr/cometa/string.hpp ${PROJECT_SOURCE_DIR}/include/kfr/data/bitrev.hpp ${PROJECT_SOURCE_DIR}/include/kfr/data/sincos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/bitrev.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/conv.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/ft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/conv.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/cpuid.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/cpuid_auto.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dsp/impulse.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dsp/oscillators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dsp/units.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/dcremove.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fir.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fir_design.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fracdelay.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/goertzel.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/impulse.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/interpolation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/mixdown.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/oscillators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/resample.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/sample_rate_conversion.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/speaker.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dsp/units.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/waveshaper.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/weighting.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/window.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/audiofile.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/math.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/compiletime.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/version.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/kfr.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/intrinsics.h - ${PROJECT_SOURCE_DIR}/include/kfr/cometa.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cometa/string.hpp - - ${PROJECT_SOURCE_DIR}/tests/testo/testo.hpp - ${PROJECT_SOURCE_DIR}/tests/testo/print_colored.hpp ) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -1,35 +1,48 @@ # Copyright (C) 2016 D Levin (http://www.kfrlib.com) # This file is part of KFR -# +# # KFR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # KFR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with KFR. 
-cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 2.8) if (NOT MSVC) add_compile_options(-fno-exceptions -fno-rtti -ftemplate-backtrace-limit=0) + link_libraries(${STD_LIB} pthread m) +endif () + +include_directories(../include) + +if (NOT ARM) + add_executable(multiarch multiarch.cpp multiarch_fir_sse2.cpp multiarch_fir_avx.cpp ${KFR_SRC}) + set_source_files_properties(multiarch_fir_sse2.cpp PROPERTIES COMPILE_FLAGS "-mno-avx -mno-sse3 -msse2") + set_source_files_properties(multiarch_fir_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) +endif () + +if (NOT MSVC) if (NOT ARCH_FLAGS) add_compile_options(-march=native) else () set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_FLAGS}") endif () - link_libraries(stdc++ pthread m) else () add_compile_options(/arch:AVX) endif () -include_directories(../include) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") + +find_package(MPFR) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") @@ -40,8 +53,8 @@ add_executable(intrinsic_test intrinsic_test.cpp ${KFR_SRC}) add_executable(dft_test dft_test.cpp ${KFR_SRC}) add_executable(conv_test conv_test.cpp ${KFR_SRC}) if (MPFR_FOUND) + include_directories(${MPFR_INCLUDE_DIR}) add_executable(transcendental_test transcendental_test.cpp ${KFR_SRC}) - target_include_directories(transcendental_test PRIVATE ${MPFR_INCLUDE_DIR}) target_link_libraries(transcendental_test ${MPFR_LIBRARIES}) endif () add_executable(fracdelay_test fracdelay_test.cpp ${KFR_SRC}) @@ -50,25 +63,29 @@ add_executable(complex_test complex_test.cpp ${KFR_SRC}) add_executable(vec_test vec_test.cpp ${KFR_SRC}) add_executable(stat_test stat_test.cpp ${KFR_SRC}) -enable_testing() +if (NOT IOS) + enable_testing() -add_test(NAME basic_vector_test - COMMAND ${PROJECT_BINARY_DIR}/tests/basic_vector_test) -add_test(NAME intrinsic_test - COMMAND ${PROJECT_BINARY_DIR}/tests/intrinsic_test) -add_test(NAME fracdelay_test - COMMAND ${PROJECT_BINARY_DIR}/tests/fracdelay_test) -add_test(NAME conv_test - COMMAND ${PROJECT_BINARY_DIR}/tests/conv_test) -if (MPFR_FOUND) - add_test(NAME transcendental_test - COMMAND ${PROJECT_BINARY_DIR}/tests/transcendental_test) + add_test(NAME basic_vector_test + COMMAND ${PROJECT_BINARY_DIR}/tests/basic_vector_test) + add_test(NAME intrinsic_test + COMMAND ${PROJECT_BINARY_DIR}/tests/intrinsic_test) + add_test(NAME fracdelay_test + COMMAND ${PROJECT_BINARY_DIR}/tests/fracdelay_test) + add_test(NAME conv_test + COMMAND ${PROJECT_BINARY_DIR}/tests/conv_test) + if (MPFR_FOUND) + add_test(NAME transcendental_test + COMMAND ${PROJECT_BINARY_DIR}/tests/transcendental_test) + endif () + add_test(NAME complex_test + COMMAND ${PROJECT_BINARY_DIR}/tests/complex_test) + add_test(NAME vec_test + COMMAND ${PROJECT_BINARY_DIR}/tests/vec_test) + add_test(NAME stat_test + COMMAND ${PROJECT_BINARY_DIR}/tests/stat_test) + add_test(NAME multiarch + COMMAND ${PROJECT_BINARY_DIR}/tests/multiarch) + add_test(NAME dft_test + COMMAND ${PROJECT_BINARY_DIR}/tests/dft_test) endif () -add_test(NAME complex_test - COMMAND ${PROJECT_BINARY_DIR}/tests/complex_test) -add_test(NAME vec_test - COMMAND ${PROJECT_BINARY_DIR}/tests/vec_test) -add_test(NAME stat_test - COMMAND ${PROJECT_BINARY_DIR}/tests/stat_test) -add_test(NAME dft_test - COMMAND ${PROJECT_BINARY_DIR}/tests/dft_test) diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp @@ -44,17 +44,17 @@ TEST(complex_vector) TEST(complex_cast) { - const vec<f32, 4> v1 = subcast<f32>(make_vector(c32{ 0, 1 }, c32{ 2, 3 })); + const 
vec<f32, 4> v1 = bitcast<f32>(make_vector(c32{ 0, 1 }, c32{ 2, 3 })); CHECK(v1(0) == 0.f); CHECK(v1(1) == 1.f); CHECK(v1(2) == 2.f); CHECK(v1(3) == 3.f); - const vec<c32, 1> v2 = subcast<c32>(make_vector(1.f, 2.f)); + const vec<c32, 1> v2 = bitcast<c32>(make_vector(1.f, 2.f)); CHECK(v2(0) == 1.f); CHECK(v2(1) == 2.f); - const vec<c32, 2> v3 = cast<c32>(make_vector(1.f, 2.f)); + const vec<c32, 2> v3 = make_vector(1.f, 2.f); CHECK(v3(0) == 1.f); CHECK(v3(1) == 0.f); CHECK(v3(2) == 2.f); @@ -101,6 +101,15 @@ TEST(complex_math) CHECK(cexp(c32{ 1.f, 1.f }) == c32{ 1.4686939399158849, 2.2873552871788423 }); CHECK(cexp2(c32{ 1.f, 1.f }) == c32{ 1.5384778027279442, 1.2779225526272695 }); CHECK(cexp10(c32{ 1.f, 1.f }) == c32{ -6.682015101903131, 7.439803369574931 }); + +#ifdef KFR_NATIVE_F64 + CHECK(csin(c64{ 1.f, 1.f }) == c64{ 1.2984575814159773, 0.634963914784736 }); + CHECK(ccos(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, -0.9888977057628651 }); + CHECK(csinh(c64{ 1.f, 1.f }) == c64{ 0.634963914784736, 1.2984575814159773 }); + CHECK(ccosh(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, 0.9888977057628651 }); + CHECK(clog(c64{ 1.f, 1.f }) == c64{ 0.34657359027997264, 0.7853981633974483 }); + CHECK(cexp(c64{ 1.f, 1.f }) == c64{ 1.4686939399158849, 2.2873552871788423 }); +#endif } TEST(complex_read_write) @@ -168,10 +177,12 @@ int main(int argc, char** argv) { println(library_version()); +#ifdef CMT_ARCH_SSE2 static_assert(vector_width<f32, cpu_t::sse2> == 4, ""); static_assert(vector_width<c32, cpu_t::sse2> == 2, ""); static_assert(vector_width<i32, cpu_t::sse2> == 4, ""); static_assert(vector_width<complex<i32>, cpu_t::sse2> == 2, ""); +#endif static_assert(is_numeric<vec<complex<float>, 4>>::value, ""); static_assert(is_numeric_args<vec<complex<float>, 4>>::value, ""); diff --git a/tests/conv_test.cpp b/tests/conv_test.cpp @@ -19,11 +19,11 @@ using namespace kfr; TEST(test_convolve) { - univector<double, 5> a({ 1, 2, 3, 4, 5 }); - univector<double, 5> b({ 0.25, 0.5, 1.0, 0.5, 0.25 }); - univector<double> c = convolve(a, b); + univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); + univector<fbase, 5> b({ 0.25, 0.5, 1.0, 0.5, 0.25 }); + univector<fbase> c = convolve(a, b); CHECK(c.size() == 9); - CHECK(rms(c - univector<double>({ 0.25, 1., 2.75, 5., 7.5, 8.5, 7.75, 3.5, 1.25 })) < 0.0001); + CHECK(rms(c - univector<fbase>({ 0.25, 1., 2.75, 5., 7.5, 8.5, 7.75, 3.5, 1.25 })) < 0.0001); } int main(int argc, char** argv) diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -22,12 +22,18 @@ using namespace kfr; +#ifdef KFR_NATIVE_F64 +constexpr ctypes_t<float, double> float_types{}; +#else +constexpr ctypes_t<float> float_types{}; +#endif + TEST(fft_accuracy) { testo::active_test()->show_progress = true; random_bit_generator gen(2247448713, 915890490, 864203735, 2982561); - testo::matrix(named("type") = ctypes<float, double>, // + testo::matrix(named("type") = float_types, // named("inverse") = std::make_tuple(false, true), // named("log2(size)") = make_range(1, 21), // [&gen](auto type, bool inverse, size_t log2size) { diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp @@ -1,4 +1,4 @@ -#include <kfr/math.hpp> +#include <kfr/all.hpp> using namespace kfr; diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp @@ -11,39 +11,52 @@ using namespace kfr; -constexpr ctypes_t<i8x1, i16x1, i32x1, i64x1, // - i8x2, i16x2, i32x2, i64x2, // - i8x4, i16x4, i32x4, i64x4, // - i8x8, i16x8, i32x8, i64x8, // - i8x16, i16x16, i32x16, i64x16, // - i8x3, i16x3, i32x3, i64x3 // +constexpr ctypes_t<i8x1, i8x2, i8x4, 
i8x8, i8x16, i8x3, // + i16x1, i16x2, i16x4, i16x8, i16x16, i16x3, // + i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 // +#ifdef KFR_NATIVE_I64 + , + i64x1, i64x2, i64x4, i64x8, i64x16, i64x3 // +#endif > signed_types{}; -constexpr ctypes_t<u8x1, u16x1, u32x1, u64x1, // - u8x2, u16x2, u32x2, u64x2, // - u8x4, u16x4, u32x4, u64x4, // - u8x8, u16x8, u32x8, u64x8, // - u8x16, u16x16, u32x16, u64x16, // - u8x3, u16x3, u32x3, u64x3 // +constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x3, // + u16x1, u16x2, u16x4, u16x8, u16x16, u16x3, // + u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 // +#ifdef KFR_NATIVE_I64 + , + u64x1, u64x2, u64x4, u64x8, u64x16, u64x3 // +#endif > unsigned_types{}; -constexpr ctypes_t<f32x1, f64x1, // - f32x2, f64x2, // - f32x4, f64x4, // - f32x8, f64x8, // - f32x16, f64x16, // - f32x3, f64x3 // +constexpr ctypes_t<f32x1, f32x2, f32x4, f32x8, f32x16, f32x3 // +#ifdef KFR_NATIVE_F64 + , + f64x1, f64x2, f64x4, f64x8, f64x16, f64x3 // +#endif > float_types{}; -constexpr ctypes_t<u8x1, i8x1, u16x1, i16x1, u32x1, i32x1, u64x1, i64x1, f32x1, f64x1, // - u8x2, i8x2, u16x2, i16x2, u32x2, i32x2, u64x2, i64x2, f32x2, f64x2, // - u8x4, i8x4, u16x4, i16x4, u32x4, i32x4, u64x4, i64x4, f32x4, f64x4, // - u8x8, i8x8, u16x8, i16x8, u32x8, i32x8, u64x8, i64x8, f32x8, f64x8, // - u8x16, i8x16, u16x16, i16x16, u32x16, i32x16, u64x16, i64x16, f32x16, f64x16, // - u8x3, i8x3, u16x3, i16x3, u32x3, i32x3, u64x3, i64x3, f32x3, f64x3 // +constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, // + i16x1, i16x2, i16x4, i16x8, i16x16, i16x3, // + i32x1, i32x2, i32x4, i32x8, i32x16, i32x3, // +#ifdef KFR_NATIVE_I64 + + i64x1, i64x2, i64x4, i64x8, i64x16, i64x3, // +#endif + u8x1, u8x2, u8x4, u8x8, u8x16, u8x3, // + u16x1, u16x2, u16x4, u16x8, u16x16, u16x3, // + u32x1, u32x2, u32x4, u32x8, u32x16, u32x3, // +#ifdef KFR_NATIVE_I64 + u64x1, u64x2, u64x4, u64x8, u64x16, u64x3, // +#endif + f32x1, f32x2, f32x4, f32x8, f32x16, f32x3 // +#ifdef KFR_NATIVE_F64 + , + f64x1, f64x2, f64x4, f64x8, f64x16, f64x3 // +#endif > all_types{}; @@ -145,13 +158,13 @@ TEST(intrin_abs) TEST(intrin_sqrt) { - testo::assert_is_same<decltype(kfr::sqrt(9)), double>(); - testo::assert_is_same<decltype(kfr::sqrt(make_vector(9))), f64x1>(); - testo::assert_is_same<decltype(kfr::sqrt(make_vector(9, 25))), f64x2>(); + testo::assert_is_same<decltype(kfr::sqrt(9)), fbase>(); + testo::assert_is_same<decltype(kfr::sqrt(make_vector(9))), vec<fbase, 1>>(); + testo::assert_is_same<decltype(kfr::sqrt(make_vector(9, 25))), vec<fbase, 2>>(); CHECK(kfr::sqrt(9) == 3.0); CHECK(kfr::sqrt(-9) == qnan); - CHECK(kfr::sqrt(make_vector(9)) == make_vector(3.0)); - CHECK(kfr::sqrt(make_vector(-9)) == make_vector(qnan)); + CHECK(kfr::sqrt(make_vector(9)) == make_vector<fbase>(3.0)); + CHECK(kfr::sqrt(make_vector(-9)) == make_vector<fbase>(qnan)); testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ 0, 2, 65536 }, [](auto type, int value) { using T = type_of<decltype(type)>; @@ -180,8 +193,8 @@ TEST(intrin_round) CHECK(kfr::fract(100) == 0); testo::matrix(named("type") = float_types, - named("value") = std::vector<double>{ -1.51, -1.49, 0.0, +1.49, +1.51 }, - [](auto type, double value) { + named("value") = std::vector<fbase>{ -1.51, -1.49, 0.0, +1.49, +1.51 }, + [](auto type, fbase value) { using T = type_of<decltype(type)>; using Tsub = subtype<T>; const T x(value); @@ -201,10 +214,9 @@ TEST(intrin_min_max) CHECK(min(pack(1, 2, 3), 2) == pack(1, 2, 2)); CHECK(min(pack(1., 2., 3.), 2) == pack(1., 2., 2.)); - 
testo::matrix(named("type") = float_types, - named("value") = - std::vector<std::pair<double, double>>{ { -100, +100 }, { infinity, 0.0 } }, - [](auto type, std::pair<double, double> value) { + testo::matrix(named("type") = float_types, + named("value") = std::vector<std::pair<fbase, fbase>>{ { -100, +100 }, { infinity, 0.0 } }, + [](auto type, std::pair<fbase, fbase> value) { using T = type_of<decltype(type)>; using Tsub = subtype<T>; const T x(value.first); diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp @@ -0,0 +1,56 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/io/tostring.hpp> + +#include "testo/testo.hpp" +#include <kfr/dsp.hpp> + +using namespace kfr; + +cpu_t fir_sse2(univector<double, 0> data, univector<double, 4>& taps); +cpu_t fir_avx(univector<double, 0> data, univector<double, 4>& taps); + +TEST(test_fir_sse2) +{ + univector<double, 8> data = counter(); + univector<double, 4> taps({ 0.5, 1.0, 1.0, 0.5 }); + cpu_t c = fir_sse2(data, taps); + CHECK(c == cpu_t::sse2); + CHECK(data[0] == 0); + CHECK(data[1] == 0.5); + CHECK(data[2] == 2); + CHECK(data[3] == 4.5); + CHECK(data[4] == 7.5); + CHECK(data[5] == 10.5); + CHECK(data[6] == 13.5); + CHECK(data[7] == 16.5); +} + +TEST(test_fir_avx) +{ + if (get_cpu() >= cpu_t::avx1) + { + univector<double, 8> data = counter(); + univector<double, 4> taps({ 0.5, 1.0, 1.0, 0.5 }); + cpu_t c = fir_avx(data, taps); + CHECK(c == cpu_t::avx); + CHECK(data[0] == 0); + CHECK(data[1] == 0.5); + CHECK(data[2] == 2); + CHECK(data[3] == 4.5); + CHECK(data[4] == 7.5); + CHECK(data[5] == 10.5); + CHECK(data[6] == 13.5); + CHECK(data[7] == 16.5); + } + else + { + println("No AVX"); + } +} + +int main(int argc, char** argv) { return testo::run_all("", true); } diff --git a/tests/multiarch_fir_avx.cpp b/tests/multiarch_fir_avx.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/dsp.hpp> +#include <kfr/io/tostring.hpp> +#include <kfr/version.hpp> + +using namespace kfr; + +cpu_t fir_avx(univector<double, 0> data, univector<double, 4>& taps) +{ + println(library_version()); + data = short_fir(data, taps); + return cpu_t::native; +} diff --git a/tests/multiarch_fir_sse2.cpp b/tests/multiarch_fir_sse2.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/dsp.hpp> +#include <kfr/io/tostring.hpp> +#include <kfr/version.hpp> + +using namespace kfr; + +cpu_t fir_sse2(univector<double, 0> data, univector<double, 4>& taps) +{ + println(library_version()); + data = short_fir(data, taps); + return cpu_t::native; +} diff --git a/tests/testo/testo.hpp b/tests/testo/testo.hpp @@ -372,7 +372,7 @@ struct test_case void check(bool result, const std::string& value, const char* expr) { - subtests.push_back(subtest{ result, format("{} | {}", padleft(22, expr), value), comment }); + subtests.push_back(subtest{ result, as_string(padleft(22, expr), " | ", value), comment }); result ? 
success++ : failed++; if (show_progress) { @@ -393,8 +393,7 @@ struct test_case void check(comparison<Op, L, R> comparison, const char* expr) { bool result = comparison(); - check(result, format("{} {} {}", as_string(comparison.left), Op::op(), as_string(comparison.right)), - expr); + check(result, as_string(comparison.left, " ", Op::op(), " ", comparison.right), expr); } template <typename L> @@ -409,7 +408,8 @@ struct test_case comment = text; if (show_progress) { - printfmt("\n{}:\n", comment); + println(); + println(comment, ":"); } } @@ -469,22 +469,22 @@ template <typename Arg0, typename Fn> void matrix(named_arg<Arg0>&& arg0, Fn&& fn) { cforeach(std::forward<Arg0>(arg0.value), [&](auto v0) { - active_test()->set_comment(format("{} = {}", arg0.name, v0)); + active_test()->set_comment(as_string(arg0.name, " = ", v0)); fn(v0); }); if (active_test()->show_progress) - printfmt("\n"); + println(); } template <typename Arg0, typename Arg1, typename Fn> void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, Fn&& fn) { cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), [&](auto v0, auto v1) { - active_test()->set_comment(format("{} = {}, {} = {}", arg0.name, v0, arg1.name, v1)); + active_test()->set_comment(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1)); fn(v0, v1); }); if (active_test()->show_progress) - printfmt("\n"); + println(); } template <typename Arg0, typename Arg1, typename Arg2, typename Fn> @@ -493,11 +493,11 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value), [&](auto v0, auto v1, auto v2) { active_test()->set_comment( - format("{} = {}, {} = {}, {} = {}", arg0.name, v0, arg1.name, v1, arg2.name, v2)); + as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", v2)); fn(v0, v1, v2); }); if (active_test()->show_progress) - printfmt("\n"); + println(); } static int run_all(const std::string& name = std::string(), bool show_successful = false) @@ -545,7 +545,7 @@ void assert_is_same_decay() #define TESTO_TEST(name) \ void test_function_##name(); \ ::testo::test_case test_case_##name(&test_function_##name, #name); \ - void CID_NOINLINE test_function_##name() + void CMT_NOINLINE test_function_##name() #define TESTO_DTEST(name) \ template <typename> \ diff --git a/tests/vec_test.cpp b/tests/vec_test.cpp @@ -7,6 +7,7 @@ #include <kfr/io/tostring.hpp> #include "testo/testo.hpp" +#include <kfr/dsp/mixdown.hpp> #include <kfr/math.hpp> using namespace kfr; @@ -64,7 +65,7 @@ TEST(vec_apply) CHECK(apply(fn_sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25)); } -#ifdef CID_ARCH_SSE +#ifdef CMT_ARCH_SSE TEST(vec_tovec) { const __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f); @@ -132,4 +133,80 @@ TEST(vec_conv) testo::assert_is_same<decltype(min(pack(1.0, 2.0, 3.0), pack(1, 2, 3))), f64x3>(); } +TEST(vec_matrix) +{ + using i32x2x2 = vec<vec<int, 2>, 2>; + const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } }); + + CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } }); + + i32x2 xy{ 10, 20 }; + i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + xy = hadd(xy * m); + CHECK(xy == i32x2{ 40, 120 }); + + i32x2 xy2{ 10, 20 }; + xy2 = hadd(transpose(xy2 * m)); + CHECK(xy2 == i32x2{ 50, 110 }); +} + +TEST(vec_is_convertible) +{ + static_assert(std::is_convertible<float, f32x4>::value, ""); + 
static_assert(std::is_convertible<float, f64x8>::value, ""); + static_assert(std::is_convertible<float, u8x3>::value, ""); + + static_assert(std::is_convertible<u16x4, i32x4>::value, ""); + static_assert(!std::is_convertible<u16x4, i32x3>::value, ""); + static_assert(!std::is_convertible<u16x1, u16x16>::value, ""); + + static_assert(std::is_convertible<float, complex<float>>::value, ""); + static_assert(std::is_convertible<float, complex<double>>::value, ""); + static_assert(std::is_convertible<short, complex<double>>::value, ""); + + static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, ""); + static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, ""); + + static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); + static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); + + CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); + CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); + CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); + + CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 }); + + CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f }); + CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. }); + CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. }); + + CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) == + vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); + + CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) == + vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. } }); + + CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{ + vec<float, 4>{ 1.f, 2.f, 3.f, 4.f }, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) == + vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } }); +} + +TEST(vec_pack_expr) +{ + const univector<float, 20> v1 = 1 + counter(); + const univector<float, 20> v2 = v1 * 11; + const univector<f32x2, 20> v3 = pack(v1, v2); + CHECK(v3[0] == f32x2{ 1, 11 }); + CHECK(v3[1] == f32x2{ 2, 22 }); + CHECK(v3[18] == f32x2{ 19, 209 }); + CHECK(v3[19] == f32x2{ 20, 220 }); + + const univector<f32x2, 20> v4 = bind_expression(fn_reverse(), v3); + CHECK(v4[0] == f32x2{ 11, 1 }); + CHECK(v4[1] == f32x2{ 22, 2 }); + CHECK(v4[18] == f32x2{ 209, 19 }); + CHECK(v4[19] == f32x2{ 220, 20 }); +} + int main(int argc, char** argv) { return testo::run_all("", true); } diff --git a/update-sources.py b/update-sources.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +from __future__ import print_function + +import fnmatch +import os +import subprocess +import sys +import glob + +path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'include') + +masks = ['*.hpp', '*.h', '*.i'] + +filenames = [] +for root, dirnames, files in os.walk(path, path): + for mask in masks: + for filename in fnmatch.filter(files, mask): + filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/')) + +cmake = """ +# Auto-generated file. Do not edit +# Use update-sources.py + +set( + KFR_SRC + """ + "\n ".join(['${PROJECT_SOURCE_DIR}/include/' + f for f in filenames]) + """ +) +""" + +with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sources.cmake'), "w") as f: + f.write(cmake)
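A minimal usage sketch of the sample_rate_converter API introduced above (C++, illustrative only: the 48 kHz to 44.1 kHz ratio, the block length and the counter() test signal are assumptions, not taken from this commit; passing the univector directly relies on its implicit conversion to univector_ref<const T>):

// Sketch of calling the renamed sample-rate-conversion factory; assumptions noted above.
#include <kfr/dsp.hpp>

using namespace kfr;

size_t downsample_block_example()
{
    // Block must exceed the converter's internal depth (1 << (quality + 1), here 512)
    // for the first call to produce any output; counter() is just a placeholder signal.
    const univector<double, 4096> input = counter();
    univector<double, 4096> output = zeros(); // oversized; the return value gives the valid length

    // New factory name; the quality constants now live in sample_rate_conversion_quality
    // (resample_quality is kept as a namespace alias).
    auto src = sample_rate_converter<double>(sample_rate_conversion_quality::normal, 44100, 48000);

    // operator()(T* dest, univector_ref<const T> src) returns the number of output samples written.
    return src(output.data(), input);
}

The deprecated resampler() factory kept above takes the same arguments, so code written against the old name continues to compile.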