kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit b6320ef16497bcbfe26f0bd107c3f4b9ca3278a3
parent da99a8186349038c9d15c3e3f15a1b7f6b5975d3
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Thu, 21 Feb 2019 01:26:26 +0000

KFR 3.0.5

Diffstat:
M.gitignore | 2+-
MCHANGELOG.md | 22++++++++++++++++++++++
MCMakeLists.txt | 175+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mazure-pipelines.yml | 26++++++++++++++------------
Mcmake/arm.cmake | 4+++-
Acmake/detect_cpu.cpp | 10++++++++++
Acmake/target_set_arch.cmake | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dcmake/test_toolset/CMakeLists.txt | 3---
Mexamples/CMakeLists.txt | 23+++++++++++------------
Mexamples/biquads.cpp | 2++
Mexamples/fir.cpp | 2++
Mexamples/sample_rate_conversion.cpp | 2++
Mexamples/window.cpp | 2++
Minclude/kfr/all.hpp | 1-
Minclude/kfr/base.hpp | 33++++-----------------------------
Dinclude/kfr/base/abs.hpp | 49-------------------------------------------------
Dinclude/kfr/base/asin_acos.hpp | 67-------------------------------------------------------------------
Dinclude/kfr/base/atan.hpp | 107-------------------------------------------------------------------------------
Minclude/kfr/base/basic_expressions.hpp | 220+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Dinclude/kfr/base/bitwise.hpp | 136-------------------------------------------------------------------------------
Dinclude/kfr/base/clamp.hpp | 62--------------------------------------------------------------
Dinclude/kfr/base/comparison.hpp | 149-------------------------------------------------------------------------------
Dinclude/kfr/base/compiletime.hpp | 84-------------------------------------------------------------------------------
Dinclude/kfr/base/complex.hpp | 967-------------------------------------------------------------------------------
Dinclude/kfr/base/constants.hpp | 299-------------------------------------------------------------------------------
Minclude/kfr/base/conversion.hpp | 12++++++++----
Dinclude/kfr/base/digitreverse.hpp | 107-------------------------------------------------------------------------------
Minclude/kfr/base/expression.hpp | 219+++++++++++++++++++++++++++++++++++++++----------------------------------------
Minclude/kfr/base/filter.hpp | 9++++++---
Minclude/kfr/base/fraction.hpp | 3+--
Dinclude/kfr/base/function.hpp | 268-------------------------------------------------------------------------------
Ainclude/kfr/base/function_expressions.hpp | 30++++++++++++++++++++++++++++++
Dinclude/kfr/base/gamma.hpp | 60------------------------------------------------------------
Minclude/kfr/base/generators.hpp | 86++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Dinclude/kfr/base/horizontal.hpp | 119-------------------------------------------------------------------------------
Dinclude/kfr/base/hyperbolic.hpp | 120-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/abs.hpp | 126-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/asin_acos.hpp | 58----------------------------------------------------------
Dinclude/kfr/base/impl/atan.hpp | 229-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/clamp.hpp | 56--------------------------------------------------------
Dinclude/kfr/base/impl/gamma.hpp | 72------------------------------------------------------------------------
Dinclude/kfr/base/impl/hyperbolic.hpp | 100-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/log_exp.hpp | 315-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/logical.hpp | 289------------------------------------------------------------------------------
Dinclude/kfr/base/impl/min_max.hpp | 232-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/modzerobessel.hpp | 105-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/round.hpp | 255-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/saturation.hpp | 192-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/select.hpp | 261-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/sin_cos.hpp | 338-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/sqrt.hpp | 71-----------------------------------------------------------------------
Dinclude/kfr/base/impl/tan.hpp | 141-------------------------------------------------------------------------------
Dinclude/kfr/base/intrinsics.h | 18------------------
Dinclude/kfr/base/kfr.h | 46----------------------------------------------
Dinclude/kfr/base/log_exp.hpp | 229-------------------------------------------------------------------------------
Dinclude/kfr/base/logical.hpp | 50--------------------------------------------------
Minclude/kfr/base/memory.hpp | 66+++++++++++++++++++++++++++++++++++++-----------------------------
Dinclude/kfr/base/min_max.hpp | 107-------------------------------------------------------------------------------
Dinclude/kfr/base/modzerobessel.hpp | 44--------------------------------------------
Dinclude/kfr/base/operators.hpp | 552-------------------------------------------------------------------------------
Dinclude/kfr/base/platform.hpp | 186-------------------------------------------------------------------------------
Minclude/kfr/base/pointer.hpp | 95+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/base/random.hpp | 86+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Dinclude/kfr/base/read_write.hpp | 239-------------------------------------------------------------------------------
Minclude/kfr/base/reduce.hpp | 72++++++++++++++++++++++++++++++++++++++++--------------------------------
Dinclude/kfr/base/round.hpp | 158-------------------------------------------------------------------------------
Dinclude/kfr/base/saturation.hpp | 62--------------------------------------------------------------
Dinclude/kfr/base/select.hpp | 57---------------------------------------------------------
Dinclude/kfr/base/shuffle.hpp | 625-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_clang.hpp | 350-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_intrin.hpp | 392-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_x86.hpp | 272-------------------------------------------------------------------------------
Dinclude/kfr/base/sin_cos.hpp | 315-------------------------------------------------------------------------------
Minclude/kfr/base/small_buffer.hpp | 7+++----
Minclude/kfr/base/sort.hpp | 18+++++++++++-------
Dinclude/kfr/base/specializations.i | 109-------------------------------------------------------------------------------
Dinclude/kfr/base/sqrt.hpp | 50--------------------------------------------------
Dinclude/kfr/base/tan.hpp | 56--------------------------------------------------------
Dinclude/kfr/base/types.hpp | 429-------------------------------------------------------------------------------
Minclude/kfr/base/univector.hpp | 133++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Dinclude/kfr/base/vec.hpp | 1171-------------------------------------------------------------------------------
Minclude/kfr/cident.h | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Minclude/kfr/cometa.hpp | 560++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Minclude/kfr/cometa/array.hpp | 65+++++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/cometa/cstring.hpp | 50+++++++++++++++++++++++++-------------------------
Minclude/kfr/cometa/ctti.hpp | 23+++++++++++++++++------
Minclude/kfr/cometa/function.hpp | 34+++++++++++++++++-----------------
Minclude/kfr/cometa/named_arg.hpp | 4++--
Ainclude/kfr/cometa/numeric.hpp | 194+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/cometa/range.hpp | 33++++++++++++++++++---------------
Minclude/kfr/cometa/result.hpp | 13+++++++------
Minclude/kfr/cometa/string.hpp | 8++++----
Dinclude/kfr/cpuid.hpp | 26--------------------------
Dinclude/kfr/cpuid/cpuid.hpp | 297-------------------------------------------------------------------------------
Dinclude/kfr/cpuid/cpuid_auto.hpp | 60------------------------------------------------------------
Dinclude/kfr/data/sincos.hpp | 192-------------------------------------------------------------------------------
Minclude/kfr/dft/cache.hpp | 3+++
Minclude/kfr/dft/convolution.hpp | 28+++++++++++++++-------------
Rinclude/kfr/data/bitrev.hpp -> include/kfr/dft/data/bitrev.hpp | 0
Ainclude/kfr/dft/data/sincos.hpp | 192+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/fft.hpp | 49++++++++++++++++++++++++++++++-------------------
Minclude/kfr/dft/impl/bitrev.hpp | 45++++++++++++++++++++++++---------------------
Minclude/kfr/dft/impl/convolution-impl.cpp | 26++++++++++++++------------
Ainclude/kfr/dft/impl/dft-fft.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/impl/dft-impl.hpp | 1311+++++--------------------------------------------------------------------------
Minclude/kfr/dft/impl/dft-src.cpp | 24++++++++++++------------
Minclude/kfr/dft/impl/dft-templates.hpp | 18++++++------------
Ainclude/kfr/dft/impl/fft-impl-f32.cpp | 29+++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-impl-f64.cpp | 29+++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-impl.hpp | 1148+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-templates.hpp | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/impl/ft.hpp | 462+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Minclude/kfr/dft/reference_dft.hpp | 8++++----
Minclude/kfr/dsp.hpp | 1-
Minclude/kfr/dsp/biquad.hpp | 128++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Minclude/kfr/dsp/biquad_design.hpp | 21++++++++++++---------
Minclude/kfr/dsp/dcremove.hpp | 7+++++--
Minclude/kfr/dsp/delay.hpp | 57++++++++++++++++++++++++++++++++-------------------------
Minclude/kfr/dsp/ebu.hpp | 58+++++++++++++++++++++++++++++++++++++++++++---------------
Minclude/kfr/dsp/fir.hpp | 57++++++++++++++++++++++++++++++++-------------------------
Minclude/kfr/dsp/fir_design.hpp | 67+++++++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/dsp/fracdelay.hpp | 8++++++--
Minclude/kfr/dsp/goertzel.hpp | 24++++++++++++++----------
Dinclude/kfr/dsp/interpolation.hpp | 72------------------------------------------------------------------------
Minclude/kfr/dsp/mixdown.hpp | 12++++++++----
Minclude/kfr/dsp/oscillators.hpp | 96+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Minclude/kfr/dsp/sample_rate_conversion.hpp | 140+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Minclude/kfr/dsp/speaker.hpp | 5++++-
Minclude/kfr/dsp/special.hpp | 20++++++++++++--------
Minclude/kfr/dsp/units.hpp | 55+++++++++++++++++++++++++++++--------------------------
Minclude/kfr/dsp/waveshaper.hpp | 20++++++++++++--------
Minclude/kfr/dsp/weighting.hpp | 34+++++++++++++++++++---------------
Minclude/kfr/dsp/window.hpp | 128++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Dinclude/kfr/ext/console_colors.hpp | 162-------------------------------------------------------------------------------
Dinclude/kfr/ext/double_double.hpp | 86-------------------------------------------------------------------------------
Minclude/kfr/io/audiofile.hpp | 54+++++++++++++++++++++++++++++-------------------------
Minclude/kfr/io/file.hpp | 27++++++++++++++++++++++++---
Minclude/kfr/io/impl/audiofile-impl.cpp | 4++++
Minclude/kfr/io/python_plot.hpp | 20++++++++++----------
Minclude/kfr/io/tostring.hpp | 53++++++++++++++++++++++++++++++++++++++++++++++++++---
Ainclude/kfr/kfr.h | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/math.hpp | 22+++++++++++++++++++++-
Ainclude/kfr/math/abs.hpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/asin_acos.hpp | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/atan.hpp | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/clamp.hpp | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/compiletime.hpp | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/complex_math.hpp | 410+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/gamma.hpp | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/hyperbolic.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/abs.hpp | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/asin_acos.hpp | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/atan.hpp | 230+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/clamp.hpp | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/gamma.hpp | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/hyperbolic.hpp | 99+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/log_exp.hpp | 335+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/logical.hpp | 278+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/min_max.hpp | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/modzerobessel.hpp | 104+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/round.hpp | 282+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/saturation.hpp | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/select.hpp | 329+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/sin_cos.hpp | 310+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/sqrt.hpp | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/tan.hpp | 149+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/interpolation.hpp | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/log_exp.hpp | 232+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/logical.hpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/min_max.hpp | 111+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/modzerobessel.hpp | 47+++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/round.hpp | 163+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/saturation.hpp | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/select.hpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/sin_cos.hpp | 318+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/sqrt.hpp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/tan.hpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/runtime.hpp | 26++++++++++++++++++++++++++
Ainclude/kfr/runtime/cpuid.hpp | 300+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/runtime/cpuid_auto.hpp | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd.hpp | 36++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/comparison.hpp | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/complex.hpp | 468+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/constants.hpp | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/digitreverse.hpp | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/horizontal.hpp | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend.hpp | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend_clang.hpp | 228+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend_generic.hpp | 1080+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/basicoperators_clang.hpp | 178+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/basicoperators_generic.hpp | 1674+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/function.hpp | 295+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/intrinsics.h | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/operators.hpp | 164+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/simd.hpp | 183+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/specializations.i | 116+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/mask.hpp | 155+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/operators.hpp | 810+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/platform.hpp | 286+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/read_write.hpp | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/shuffle.hpp | 569+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/types.hpp | 372+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/vec.hpp | 1283+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/testo/assert.hpp | 5++++-
Minclude/kfr/testo/comparison.hpp | 38++++++++++++++++++++++++++++++++------
Ainclude/kfr/testo/console_colors.hpp | 166+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/testo/double_double.hpp | 170+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/testo/testo.hpp | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Minclude/kfr/version.hpp | 3+--
Msources.cmake | 172+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Mtests/CMakeLists.txt | 121+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Mtests/all_tests.cpp | 20+++++++++++++++++++-
Atests/all_tests_merged.cpp | 25+++++++++++++++++++++++++
Atests/asm_test.cpp | 213+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtests/base_test.cpp | 346++-----------------------------------------------------------------------------
Mtests/complex_test.cpp | 18++++++++----------
Atests/data/acos_double_fuzz | 0
Atests/data/acos_double_narrow | 0
Atests/data/acos_float_fuzz | 0
Atests/data/acos_float_narrow | 0
Atests/data/asin_double_fuzz | 0
Atests/data/asin_double_narrow | 0
Atests/data/asin_float_fuzz | 0
Atests/data/asin_float_narrow | 0
Atests/data/atan2_double_fuzz | 0
Atests/data/atan2_double_narrow | 0
Atests/data/atan2_float_fuzz | 0
Atests/data/atan2_float_narrow | 0
Atests/data/atan_double_fuzz | 0
Atests/data/atan_double_narrow | 0
Atests/data/atan_float_fuzz | 0
Atests/data/atan_float_narrow | 0
Atests/data/cbrt_double_fuzz | 0
Atests/data/cbrt_double_narrow | 0
Atests/data/cbrt_float_fuzz | 0
Atests/data/cbrt_float_narrow | 0
Atests/data/cos_double_fuzz | 0
Atests/data/cos_double_narrow | 0
Atests/data/cos_float_fuzz | 0
Atests/data/cos_float_narrow | 0
Atests/data/cosh_double_fuzz | 0
Atests/data/cosh_double_narrow | 0
Atests/data/cosh_float_fuzz | 0
Atests/data/cosh_float_narrow | 0
Atests/data/coth_double_fuzz | 0
Atests/data/coth_double_narrow | 0
Atests/data/coth_float_fuzz | 0
Atests/data/coth_float_narrow | 0
Atests/data/exp10_double_fuzz | 0
Atests/data/exp10_double_narrow | 0
Atests/data/exp10_float_fuzz | 0
Atests/data/exp10_float_narrow | 0
Atests/data/exp2_double_fuzz | 0
Atests/data/exp2_double_narrow | 0
Atests/data/exp2_float_fuzz | 0
Atests/data/exp2_float_narrow | 0
Atests/data/exp_double_fuzz | 0
Atests/data/exp_double_narrow | 0
Atests/data/exp_float_fuzz | 0
Atests/data/exp_float_narrow | 0
Atests/data/gamma_double_fuzz | 0
Atests/data/gamma_double_narrow | 0
Atests/data/gamma_float_fuzz | 0
Atests/data/gamma_float_narrow | 0
Atests/data/log10_double_fuzz | 0
Atests/data/log10_double_narrow | 0
Atests/data/log10_float_fuzz | 0
Atests/data/log10_float_narrow | 0
Atests/data/log2_double_fuzz | 0
Atests/data/log2_double_narrow | 0
Atests/data/log2_float_fuzz | 0
Atests/data/log2_float_narrow | 0
Atests/data/log_double_fuzz | 0
Atests/data/log_double_narrow | 0
Atests/data/log_float_fuzz | 0
Atests/data/log_float_narrow | 0
Atests/data/sin_double_fuzz | 0
Atests/data/sin_double_narrow | 0
Atests/data/sin_float_fuzz | 0
Atests/data/sin_float_narrow | 0
Atests/data/sinh_double_fuzz | 0
Atests/data/sinh_double_narrow | 0
Atests/data/sinh_float_fuzz | 0
Atests/data/sinh_float_narrow | 0
Atests/data/tan_double_fuzz | 0
Atests/data/tan_double_narrow | 0
Atests/data/tan_float_fuzz | 0
Atests/data/tan_float_narrow | 0
Atests/data/tanh_double_fuzz | 0
Atests/data/tanh_double_narrow | 0
Atests/data/tanh_float_fuzz | 0
Atests/data/tanh_float_narrow | 0
Mtests/dft_test.cpp | 111+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mtests/dsp_test.cpp | 208++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Dtests/ebu_test.cpp | 122-------------------------------------------------------------------------------
Dtests/empty_test.cpp | 5-----
Mtests/expression_test.cpp | 31+++++++++++++++++++++++--------
Atests/generate_data.cpp | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtests/intrinsic_test.cpp | 290++++++++-----------------------------------------------------------------------
Mtests/io_test.cpp | 15+++++++++------
Mtests/mpfr/mpfrplus.hpp | 25++++++++++++++++---------
Mtests/multiarch.cpp | 1-
Atests/numeric_tests.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dtests/resampler_test.cpp | 37-------------------------------------
Dtests/transcendental_test.cpp | 172-------------------------------------------------------------------------------
Atests/unit/base/conversion.cpp | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/base/reduce.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Atests/unit/math/abs.cpp | 13+++++++++++++
Atests/unit/math/asin_acos.cpp | 18++++++++++++++++++
Atests/unit/math/atan.cpp | 18++++++++++++++++++
Atests/unit/math/hyperbolic.cpp | 21+++++++++++++++++++++
Atests/unit/math/log_exp.cpp | 23+++++++++++++++++++++++
Atests/unit/math/min_max.cpp | 39+++++++++++++++++++++++++++++++++++++++
Atests/unit/math/round.cpp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/math/select.cpp | 27+++++++++++++++++++++++++++
Atests/unit/math/sin_cos.cpp | 17+++++++++++++++++
Atests/unit/math/tan.cpp | 16++++++++++++++++
Atests/unit/simd/complex.cpp | 33+++++++++++++++++++++++++++++++++
Atests/unit/simd/operators.cpp | 220+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/simd/shuffle.cpp | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/simd/vec.cpp | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/CMakeLists.txt | 28++++++++++++++++++++++++++++
Atools/ebu_test.cpp | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rexamples/sample_rate_converter.cpp -> tools/sample_rate_converter.cpp | 0
Mupdate-sources.py | 32++++++++++++++++++++------------
325 files changed, 21331 insertions(+), 15841 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -77,7 +77,7 @@ var/ venv/ -# Sphinx documentation +# Documentation docs/ mkdocs/ diff --git a/CHANGELOG.md b/CHANGELOG.md @@ -1,5 +1,26 @@ # Changelog +## 3.0.5 + +2019-02-21 + +#### Added + +- DFT speeds have been improved by up to 15% on most modern cpus +- Support for MSVC 2017 +- Support for GCC 7.3 +- Support for GCC 8.2 +- Support for resampling complex vectors (Thanks to https://github.com/ermito) +- Tests for various math functions no longer depend on MPFR + +#### Changed + +- Testo now allocates much less memory during long tests (x3 less than previously) + +#### Fixed + +- Building generators (Thanks to https://github.com/ermito) + ## 3.0.4 2019-01-08 @@ -9,6 +30,7 @@ #### Changed - KFR_READCYCLECOUNTER may be redefined to point to any function returning (pseudo-)random value +- Ability to disable random number initialization functions #### Fixed diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -15,15 +15,33 @@ # along with KFR. -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) + +message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}" CACHE STRING "compile flags" FORCE) +message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) + project(kfr CXX) -message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ") message(STATUS CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR}) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + set (X86 TRUE) +else () + set (X86 FALSE) +endif () + +if (X86) + message(STATUS X86) +endif () + if (MSVC) message(STATUS MSVC) endif() @@ -34,77 +52,128 @@ else() set(CLANG 0) endif() -# Include list of source files +# Include autogenerated list of source files 
include(sources.cmake) -add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) +option(ENABLE_TESTS "Enable tests and examples" OFF) +if (CLANG) + option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON) +endif () +option(ENABLE_ASMTEST "Enable writing disassembly" OFF) +option(REGENERATE_TESTS "Regenerate auto tests" OFF) +option(DISABLE_CLANG_EXTENSIONS "Disable Clang vector extensions" OFF) +option(KFR_EXTENDED_TESTS "Extended tests (up to hour)" OFF) +mark_as_advanced(ENABLE_ASMTEST) +mark_as_advanced(REGENERATE_TESTS) +mark_as_advanced(DISABLE_CLANG_EXTENSIONS) + +if (NOT CPU_ARCH) + set(CPU_ARCH avx2) +endif () -option(ENABLE_TESTS "Enable tests and examples. This changes many compiler flags" OFF) -option(ENABLE_DFT "Enable DFT and related algorithms" ON) +if (CPU_ARCH STREQUAL "detect") + message(STATUS "Detecting native cpu...") + try_run( + RUN_RESULT COMPILE_RESULT + "${CMAKE_BINARY_DIR}/tmpdir" + ${CMAKE_SOURCE_DIR}/cmake/detect_cpu.cpp + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_SOURCE_DIR}/include" + COMPILE_OUTPUT_VARIABLE COMPILE_OUT + RUN_OUTPUT_VARIABLE RUN_OUT + ) + if (COMPILE_RESULT AND RUN_RESULT EQUAL 0) + message(STATUS DETECTED_CPU = ${RUN_OUT}) + set(CPU_ARCH ${RUN_OUT}) + else() + message(STATUS COMPILE_RESULT = ${COMPILE_RESULT}) + message(STATUS RUN_RESULT = ${RUN_RESULT}) + message(STATUS COMPILE_OUT = ${COMPILE_OUT}) + message(STATUS RUN_OUT = ${RUN_OUT}) + endif () +endif () -set(KFR_DFT_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/dft_c.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp) +include(cmake/target_set_arch.cmake) -set(KFR_IO_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp) +add_library(use_arch INTERFACE) +target_set_arch(use_arch INTERFACE ${CPU_ARCH}) -if 
(ENABLE_TESTS) +if (WIN32) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) +endif() - if (IOS) - set(STD_LIB) - else () - set(STD_LIB stdc++) - endif () +if (IOS) + set(STD_LIB) +else () + set(STD_LIB stdc++) +endif () + +# KFR library +add_library(kfr INTERFACE) +target_sources(kfr INTERFACE ${KFR_SRC}) +target_include_directories(kfr INTERFACE include) +target_compile_options(kfr INTERFACE "$<$<CONFIG:DEBUG>:-DKFR_DEBUG>") +if (NOT MSVC) + target_compile_options(kfr INTERFACE -mstackrealign) +endif () +if (MSVC) + target_compile_options(kfr INTERFACE -bigobj) +else () + target_link_libraries(kfr INTERFACE ${STD_LIB} pthread m) +endif () +if (DISABLE_CLANG_EXTENSIONS) + target_compile_definitions(kfr INTERFACE -DCMT_DISABLE_CLANG_EXT) +endif () +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + target_compile_options(kfr INTERFACE -Wno-ignored-qualifiers) +endif () +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(kfr INTERFACE -Wno-c++1z-extensions) +endif () - # Binary output directories - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) +if (NOT ENABLE_DFT) + target_compile_definitions(kfr INTERFACE -DKFR_NO_DFT) +endif () +if (KFR_EXTENDED_TESTS) + target_compile_definitions(kfr INTERFACE -DKFR_EXTENDED_TESTS) +endif() - add_definitions(-D_CRT_SECURE_NO_WARNINGS) +message(STATUS CPU_ARCH=${CPU_ARCH}) - if (NOT MSVC OR CLANG) - # Enable C++14, disable exceptions and rtti - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - add_compile_options(-std=gnu++1y) - else () - add_compile_options(-std=c++1y) - endif () - add_compile_options(-fno-exceptions -fno-rtti ) - if (NOT ARCH_FLAGS) - add_compile_options(-march=native) - message(STATUS "Building for native cpu") - if(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mstackrealign -fno-asynchronous-unwind-tables") - endif() - else () - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} ${ARCH_FLAGS}") - endif () - if(NOT MSVC) - link_libraries(${STD_LIB} pthread m) - endif() - else () - # Disable exceptions - add_compile_options(/EHsc /D_HAS_EXCEPTIONS=0 /D_CRT_SECURE_NO_WARNINGS=1) - add_compile_options(/arch:AVX) +if (ENABLE_TESTS) + + if (MSVC) + else() + # disable exceptions and rtti + add_compile_options(-fno-exceptions -fno-rtti -fno-asynchronous-unwind-tables) endif () add_subdirectory(examples) add_subdirectory(tests) + add_subdirectory(tools) endif () -add_library(kfr INTERFACE) -target_sources(kfr INTERFACE ${KFR_SRC}) -target_include_directories(kfr INTERFACE include) - if (ENABLE_DFT) + if (NOT CLANG) + message(FATAL_ERROR "Clang compiler is required for DFT in KFR. See README.md for more information") + endif() add_library(kfr_dft ${KFR_DFT_SRC}) - target_link_libraries(kfr_dft kfr) + target_link_libraries(kfr_dft kfr use_arch) + if (MSVC) + target_compile_options(kfr_dft PRIVATE -fp:fast) + else() + target_compile_options(kfr_dft PRIVATE -ffast-math) + endif() endif() add_library(kfr_io ${KFR_IO_SRC}) target_link_libraries(kfr_io kfr) target_compile_definitions(kfr_io PUBLIC KFR_ENABLE_FLAC=1) + +install(TARGETS kfr kfr_io ARCHIVE DESTINATION lib) + +if (ENABLE_DFT) + install(TARGETS kfr_dft ARCHIVE DESTINATION lib) +endif () + +install(DIRECTORY include/kfr DESTINATION include) diff --git a/azure-pipelines.yml b/azure-pipelines.yml @@ -6,7 +6,7 @@ jobs: - bash: | set -e sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev - ci/run.sh build-release -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release - job: Linux_x86_64_Clang_Debug pool: @@ -15,7 +15,7 @@ jobs: - bash: | set -e sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev - ci/run.sh build-debug -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug + ci/run.sh build-debug -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ 
-DCMAKE_BUILD_TYPE=Debug - job: Linux_ARM_Clang_Release pool: @@ -46,7 +46,7 @@ jobs: set -e /bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer" brew install ninja - ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release - job: macOS_x86_64_Clang_Debug strategy: @@ -62,7 +62,7 @@ jobs: set -e /bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer" brew install ninja - ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_64_Clang_Release pool: @@ -73,7 +73,7 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars64.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_Clang_Release pool: @@ -84,7 +84,7 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_Clang_Debug pool: @@ -95,32 +95,34 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% 
set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug + ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug - job: Windows_MinGW_x86_64_AVX512_Clang_Release pool: WIN-AVX512 steps: - script: | set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH% - bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release" + bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release" - job: Windows_MinGW_x86_64_AVX512_Clang_Debug pool: WIN-AVX512 steps: - script: | set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH% - bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug" - + bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug" + - job: Windows_MSVC_x86_64_AVX512_Clang_Release pool: WIN-AVX512 steps: - script: | call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" - ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release + set CXXFLAGS=-m64 + ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_64_AVX512_Clang_Debug pool: WIN-AVX512 steps: - script: | call "C:\Program 
Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" - ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Debug + set CXXFLAGS=-m64 + ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Debug diff --git a/cmake/arm.cmake b/cmake/arm.cmake @@ -11,7 +11,9 @@ set (CMAKE_CXX_COMPILER_WORKS TRUE) set (CMAKE_C_COMPILER_WORKS TRUE) set (ARM_ROOT "/usr/arm-linux-gnueabihf/include") -set (GCC_VER 5.4.0) +if (NOT GCC_VER) + set (GCC_VER 5.4.0) +endif () set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}") set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static") diff --git a/cmake/detect_cpu.cpp b/cmake/detect_cpu.cpp @@ -0,0 +1,9 @@ +#include <kfr/runtime/cpuid.hpp> + +using namespace kfr; + +int main() +{ + cpu_t cpu = kfr::internal_generic::detect_cpu(); + printf("%s", cpu_name(cpu)); +} +\ No newline at end of file diff --git a/cmake/target_set_arch.cmake b/cmake/target_set_arch.cmake @@ -0,0 +1,56 @@ + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + + set(ARCH_FLAGS_GNU_generic -DCMT_FORCE_GENERIC_CPU) + set(ARCH_FLAGS_GNU_sse2 -msse2) + set(ARCH_FLAGS_GNU_sse3 -msse3) + set(ARCH_FLAGS_GNU_ssse3 -mssse3) + set(ARCH_FLAGS_GNU_sse41 -msse4.1) + set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx) + set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma) + set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl) + + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + # SSE2 is part of x86_64 + set(ARCH_FLAG_MS_SSE2) + else() + set(ARCH_FLAG_MS_SSE2 -arch:SSE2) + endif() + + set(ARCH_FLAGS_MS_generic ${ARCH_FLAG_MS_SSE2} 
-DCMT_FORCE_GENERIC_CPU) + set(ARCH_FLAGS_MS_sse2 ${ARCH_FLAG_MS_SSE2}) + set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__) + set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__) + set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__) + set(ARCH_FLAGS_MS_avx -arch:AVX) + set(ARCH_FLAGS_MS_avx2 -arch:AVX2) + set(ARCH_FLAGS_MS_avx512 -arch:AVX512) + + function(target_set_arch TARGET MODE ARCH) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + set(CLANG 1) + else () + set(CLANG 0) + endif() + message(STATUS "target_set_arch(${TARGET} ${MODE} ${ARCH})") + if (CLANG OR NOT MSVC) + # Reset previous arch flags + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + target_compile_options(${TARGET} ${MODE} -mno-sse3) + else() + target_compile_options(${TARGET} ${MODE} -mno-sse) + endif() + endif () + if (MSVC AND NOT CLANG) + target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_MS_${ARCH}}) + else() + target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_GNU_${ARCH}}) + endif () + endfunction() + +else() + + function(target_set_arch TARGET MODE ARCH) + endfunction() + +endif () diff --git a/cmake/test_toolset/CMakeLists.txt b/cmake/test_toolset/CMakeLists.txt @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(test_toolset CXX) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt @@ -15,33 +15,32 @@ # along with KFR. 
-cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) -file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg) +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) -include_directories(../include) +file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg) add_executable(biquads biquads.cpp) -target_link_libraries(biquads kfr) +target_link_libraries(biquads kfr use_arch) add_executable(window window.cpp) -target_link_libraries(window kfr) +target_link_libraries(window kfr use_arch) add_executable(fir fir.cpp) +target_link_libraries(fir kfr use_arch) -target_link_libraries(fir kfr) if (ENABLE_DFT) - target_link_libraries(fir kfr_dft) + target_link_libraries(fir kfr_dft use_arch) target_compile_definitions(fir PRIVATE -DHAVE_DFT) endif () add_executable(sample_rate_conversion sample_rate_conversion.cpp) -target_link_libraries(sample_rate_conversion kfr kfr_io) - -add_executable(sample_rate_converter sample_rate_converter.cpp) -target_link_libraries(sample_rate_converter kfr kfr_io) +target_link_libraries(sample_rate_conversion kfr kfr_io use_arch) if (ENABLE_DFT) add_executable(dft dft.cpp) - target_link_libraries(dft kfr kfr_dft) + target_link_libraries(dft kfr kfr_dft use_arch) endif () diff --git a/examples/biquads.cpp b/examples/biquads.cpp @@ -94,5 +94,7 @@ int main() plot_save("biquad_filter_lowpass", output, options + ", title='Biquad Low pass filter (0.2, 0.9) (using biquad_filter)'"); + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/examples/fir.cpp b/examples/fir.cpp @@ -148,5 +148,7 @@ int main() #endif #endif + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp @@ -72,5 +72,7 @@ int main() plot_save("audio_draft_quality", "audio_draft_quality.wav", ""); } + println("SVG plots have been saved to svg 
directory"); + return 0; } diff --git a/examples/window.cpp b/examples/window.cpp @@ -57,5 +57,7 @@ int main() output = window_kaiser(output.size(), 2.5); plot_save("window_kaiser", output, options + ", title='Kaiser window'"); + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/include/kfr/all.hpp b/include/kfr/all.hpp @@ -22,7 +22,6 @@ */ #include "base.hpp" -#include "cpuid.hpp" #include "dft.hpp" #include "dsp.hpp" #include "io.hpp" diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp @@ -22,44 +22,19 @@ */ #pragma once -#include "base/abs.hpp" -#include "base/asin_acos.hpp" -#include "base/atan.hpp" +#include "math.hpp" + #include "base/basic_expressions.hpp" -#include "base/clamp.hpp" -#include "base/comparison.hpp" -#include "base/compiletime.hpp" -#include "base/complex.hpp" -#include "base/constants.hpp" #include "base/conversion.hpp" -#include "base/digitreverse.hpp" #include "base/expression.hpp" #include "base/filter.hpp" -#include "base/function.hpp" -#include "base/gamma.hpp" +#include "base/fraction.hpp" +#include "base/function_expressions.hpp" #include "base/generators.hpp" -#include "base/horizontal.hpp" -#include "base/hyperbolic.hpp" -#include "base/log_exp.hpp" -#include "base/logical.hpp" #include "base/memory.hpp" -#include "base/min_max.hpp" -#include "base/modzerobessel.hpp" -#include "base/operators.hpp" #include "base/pointer.hpp" #include "base/random.hpp" -#include "base/read_write.hpp" #include "base/reduce.hpp" -#include "base/round.hpp" -#include "base/saturation.hpp" -#include "base/select.hpp" -#include "base/shuffle.hpp" -#include "base/sin_cos.hpp" #include "base/small_buffer.hpp" #include "base/sort.hpp" -#include "base/sqrt.hpp" -#include "base/tan.hpp" -#include "base/types.hpp" #include "base/univector.hpp" -#include "base/vec.hpp" -#include "version.hpp" diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -1,49 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 
2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/abs.hpp" - -namespace kfr -{ -/** - * @brief Returns the absolute value of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 abs(const T1& x) -{ - return intrinsics::abs(x); -} - -/** - * @brief Returns template expression that returns the absolute value of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::abs, E1> abs(E1&& x) -{ - return { fn::abs(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp @@ -1,67 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/asin_acos.hpp" - -namespace kfr -{ - -/** - * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> asin(const T1& x) -{ - return intrinsics::asin(x); -} - -/** - * @brief Returns template expression that returns the arc sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::asin, E1> asin(E1&& x) -{ - return { fn::asin(), std::forward<E1>(x) }; -} -/** - * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> acos(const T1& x) -{ - return intrinsics::acos(x); -} - -/** - * @brief Returns template expression that returns the arc cosine of x. 
- */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::acos, E1> acos(E1&& x) -{ - return { fn::acos(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -1,107 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/atan.hpp" - -namespace kfr -{ - -/** - * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through - * \f$\pi/2\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> atan(const T1& x) -{ - return intrinsics::atan(x); -} - -/** - * @brief Returns template expression that returns the arc tangent of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::atan, E1> atan(E1&& x) -{ - return { fn::atan(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the arc tangent of the x, expressed in degrees. 
The returned angle is in the range -90 - * through 90. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> atandeg(const T1& x) -{ - return intrinsics::atandeg(x); -} - -/** - * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::atandeg, E1> atandeg(E1&& x) -{ - return { fn::atandeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC common_type<T1, T2> atan2(const T1& x, const T2& y) -{ - return intrinsics::atan2(x, y); -} - -/** - * @brief Returns template expression that returns the arc tangent of y/x. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y) -{ - return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the - * correct quadrant. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC common_type<T1, T2> atan2deg(const T1& x, const T2& y) -{ - return intrinsics::atan2deg(x, y); -} - -/** - * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees). 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y) -{ - return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp @@ -25,27 +25,51 @@ */ #pragma once -#include "operators.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" #include "univector.hpp" -#include "vec.hpp" #include <algorithm> namespace kfr { +inline namespace CMT_ARCH_NAME +{ + +namespace internal +{ +template <size_t width, typename Fn> +KFR_INTRINSIC void block_process_impl(size_t& i, size_t size, Fn&& fn) +{ + CMT_LOOP_NOUNROLL + for (; i < size / width * width; i += width) + fn(i, csize_t<width>()); +} +} // namespace internal + +template <size_t... widths, typename Fn> +KFR_INTRINSIC void block_process(size_t size, csizes_t<widths...>, Fn&& fn) +{ + size_t i = 0; + swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... 
}; +} namespace internal { template <typename To, typename E> -struct expression_convert : expression_base<E> +struct expression_convert : expression_with_arguments<E> { using value_type = To; - CMT_INLINE expression_convert(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {} + KFR_MEM_INTRINSIC expression_convert(E&& expr) CMT_NOEXCEPT + : expression_with_arguments<E>(std::forward<E>(expr)) + { + } template <size_t N> - CMT_INLINE vec<To, N> operator()(cinput_t input, size_t index, vec_t<To, N>) const + friend KFR_INTRINSIC vec<To, N> get_elements(const expression_convert& self, cinput_t input, + size_t index, vec_shape<To, N>) { - return this->argument_first(input, index, vec_t<To, N>()); + return self.argument_first(input, index, vec_shape<To, N>()); } }; @@ -56,7 +80,7 @@ struct expression_iterator struct iterator { T operator*() const { return get(); } - T get() const { return expr.e1(cinput, position, vec_t<T, 1>())[0]; } + T get() const { return get_elements(expr.e1, cinput, position, vec_shape<T, 1>()).front(); } iterator& operator++() { ++position; @@ -79,13 +103,13 @@ struct expression_iterator } // namespace internal template <typename To, typename E> -CMT_INLINE internal::expression_convert<To, E> convert(E&& expr) +KFR_INTRINSIC internal::expression_convert<To, E> convert(E&& expr) { return internal::expression_convert<To, E>(std::forward<E>(expr)); } template <typename E1, typename T = value_type_of<E1>> -CMT_INLINE internal::expression_iterator<T, E1> to_iterator(E1&& e1) +KFR_INTRINSIC internal::expression_iterator<T, E1> to_iterator(E1&& e1) { return internal::expression_iterator<T, E1>(std::forward<E1>(e1)); } @@ -99,30 +123,30 @@ inline auto sequence(const Ts&... 
list) } template <typename T = int> -CMT_INLINE auto zeros() +KFR_INTRINSIC auto zeros() { return lambda<T>([](cinput_t, size_t, auto x) { return zerovector(x); }); } template <typename T = int> -CMT_INLINE auto ones() +KFR_INTRINSIC auto ones() { - return lambda<T>([](cinput_t, size_t, auto x) { return 1; }); + return lambda<T>([](cinput_t, size_t, auto) { return 1; }); } template <typename T = int> -CMT_INLINE auto counter() +KFR_INTRINSIC auto counter() { return lambda<T>([](cinput_t, size_t index, auto x) { return enumerate(x) + index; }); } template <typename T1> -CMT_INLINE auto counter(T1 start) +KFR_INTRINSIC auto counter(T1 start) { return lambda<T1>([start](cinput_t, size_t index, auto x) { return enumerate(x) + index + start; }); } template <typename T1, typename T2> -CMT_INLINE auto counter(T1 start, T2 step) +KFR_INTRINSIC auto counter(T1 start, T2 step) { return lambda<common_type<T1, T2>>( [start, step](cinput_t, size_t index, auto x) { return (enumerate(x) + index) * step + start; }); @@ -149,10 +173,10 @@ namespace internal template <typename T, typename E1> struct expression_reader { - constexpr expression_reader(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {} + constexpr expression_reader(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {} T read() const { - const T result = e1(cinput, m_position, vec_t<T, 1>()); + const T result = get_elements(e1, cinput, m_position, vec_shape<T, 1>()); m_position++; return result; } @@ -162,7 +186,7 @@ struct expression_reader template <typename T, typename E1> struct expression_writer { - constexpr expression_writer(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {} + constexpr expression_writer(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {} template <typename U> void write(U value) { @@ -192,19 +216,20 @@ namespace internal { template <typename E1> -struct expression_slice : expression_base<E1> +struct expression_slice : expression_with_arguments<E1> { using value_type = value_type_of<E1>; using T = 
value_type; expression_slice(E1&& e1, size_t start, size_t size) - : expression_base<E1>(std::forward<E1>(e1)), start(start), + : expression_with_arguments<E1>(std::forward<E1>(e1)), start(start), new_size(size_min(size, size_sub(std::get<0>(this->args).size(), start))) { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_slice& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return this->argument_first(cinput, index + start, y); + return self.argument_first(cinput, index + self.start, y); } size_t size() const { return new_size; } size_t start; @@ -212,15 +237,16 @@ struct expression_slice : expression_base<E1> }; template <typename E1> -struct expression_reverse : expression_base<E1> +struct expression_reverse : expression_with_arguments<E1> { using value_type = value_type_of<E1>; using T = value_type; - expression_reverse(E1&& e1) : expression_base<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {} + expression_reverse(E1&& e1) : expression_with_arguments<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {} template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_reverse& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return reverse(this->argument_first(cinput, expr_size - index - N, y)); + return reverse(self.argument_first(cinput, self.expr_size - index - N, y)); } size_t size() const { return expr_size; } size_t expr_size; @@ -234,7 +260,7 @@ struct expression_linspace<T, false> : input_expression { using value_type = T; - CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; } expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false) : start(start), 
offset((stop - start) / T(endpoint ? size - 1 : size)), @@ -248,10 +274,11 @@ struct expression_linspace<T, false> : input_expression } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index, + vec_shape<T, N> x) { using TI = itype<T>; - return T(start) + (enumerate(x) + cast<T>(cast<TI>(index))) * T(offset); + return T(self.start) + (enumerate(x) + static_cast<T>(static_cast<TI>(index))) * T(self.offset); } T start; @@ -264,7 +291,7 @@ struct expression_linspace<T, true> : input_expression { using value_type = T; - CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; } expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false) : start(start), stop(stop), invsize(1.0 / T(endpoint ? size - 1 : size)), @@ -278,13 +305,15 @@ struct expression_linspace<T, true> : input_expression } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index, + vec_shape<T, N> x) { using TI = itype<T>; - return mix((enumerate(x) + cast<T>(cast<TI>(index))) * invsize, cast<T>(start), cast<T>(stop)); + return mix((enumerate(x) + static_cast<T>(static_cast<TI>(index))) * self.invsize, self.start, + self.stop); } template <typename U, size_t N> - CMT_INLINE static vec<U, N> mix(const vec<U, N>& t, U x, U y) + KFR_MEM_INTRINSIC static vec<U, N> mix(const vec<U, N>& t, U x, U y) { return (U(1.0) - t) * x + t * y; } @@ -296,16 +325,16 @@ struct expression_linspace<T, true> : input_expression }; template <typename... 
E> -struct expression_sequence : expression_base<E...> +struct expression_sequence : expression_with_arguments<E...> { public: - using base = expression_base<E...>; + using base = expression_with_arguments<E...>; using value_type = common_type<value_type_of<E>...>; using T = value_type; template <typename... Expr_> - CMT_INLINE expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) noexcept + KFR_MEM_INTRINSIC expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) CMT_NOEXCEPT : base(std::forward<Expr_>(expr)...) { std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1); @@ -314,20 +343,22 @@ public: } template <size_t N> - CMT_NOINLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - std::size_t sindex = size_t(std::upper_bound(std::begin(segments), std::end(segments), index) - 1 - - std::begin(segments)); - if (segments[sindex + 1] - index >= N) - return get(cinput, index, sindex - 1, y); + std::size_t sindex = + size_t(std::upper_bound(std::begin(self.segments), std::end(self.segments), index) - 1 - + std::begin(self.segments)); + if (self.segments[sindex + 1] - index >= N) + return get_elements(self, cinput, index, sindex - 1, y); else { vec<T, N> result; CMT_PRAGMA_CLANG(clang loop unroll_count(4)) for (size_t i = 0; i < N; i++) { - sindex = segments[sindex + 1] == index ? sindex + 1 : sindex; - result.data()[i] = get(cinput, index, sindex - 1, vec_t<T, 1>())[0]; + sindex = self.segments[sindex + 1] == index ? 
sindex + 1 : sindex; + result.data()[i] = get_elements(self, cinput, index, sindex - 1, vec_shape<T, 1>()).front(); index++; } return result; @@ -336,10 +367,11 @@ public: protected: template <size_t N> - CMT_NOINLINE vec<T, N> get(cinput_t cinput, size_t index, size_t expr_index, vec_t<T, N> y) + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput, + size_t index, size_t expr_index, vec_shape<T, N> y) { return cswitch(indicesfor_t<E...>(), expr_index, - [&](auto val) { return this->argument(cinput, val, index, y); }, + [&](auto val) { return self.argument(cinput, val, index, y); }, [&]() { return zerovector(y); }); } @@ -347,20 +379,24 @@ protected: }; template <typename Fn, typename E> -struct expression_adjacent : expression_base<E> +struct expression_adjacent : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - expression_adjacent(Fn&& fn, E&& e) : expression_base<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn)) {} + expression_adjacent(Fn&& fn, E&& e) + : expression_with_arguments<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn)) + { + } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_adjacent& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - const vec<T, N> delayed = insertleft(data, in); - data = in[N - 1]; - return this->fn(in, delayed); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + const vec<T, N> delayed = insertleft(self.data, in); + self.data = in[N - 1]; + return self.fn(in, delayed); } Fn fn; mutable value_type data = value_type(0); @@ -370,7 +406,7 @@ struct expression_adjacent : expression_base<E> /** @brief Returns the subrange of the given expression */ template <typename E1> -CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t 
start, size_t size = infinite_size) +KFR_INTRINSIC internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t size = infinite_size) { return internal::expression_slice<E1>(std::forward<E1>(e1), start, size); } @@ -378,15 +414,15 @@ CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t si /** @brief Returns the expression truncated to the given size */ template <typename E1> -CMT_INLINE internal::expression_slice<E1> truncate(E1&& e1, size_t size) +KFR_INTRINSIC internal::expression_slice<E1> truncate(E1&& e1, size_t size) { return internal::expression_slice<E1>(std::forward<E1>(e1), 0, size); } -/** @brief Returns reversed expression +/** @brief Returns the reversed expression */ template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1) +KFR_INTRINSIC internal::expression_reverse<E1> reverse(E1&& e1) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return internal::expression_reverse<E1>(std::forward<E1>(e1)); @@ -401,23 +437,24 @@ CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1) * @param truncate If ``true``, linspace returns exactly size elements, otherwise, returns infinite sequence */ template <typename T1, typename T2, bool precise = false, typename TF = ftype<common_type<T1, T2>>> -CMT_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, - bool endpoint = false, bool truncate = false) +KFR_INTRINSIC internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, + bool endpoint = false, + bool truncate = false) { return internal::expression_linspace<TF, precise>(start, stop, size, endpoint, truncate); } KFR_FN(linspace) template <typename T, bool precise = false, typename TF = ftype<T>> -CMT_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, - bool endpoint = false) +KFR_INTRINSIC 
internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, + bool endpoint = false) { return internal::expression_linspace<TF, precise>(symmetric_linspace, symsize, size, endpoint); } KFR_FN(symmlinspace) template <size_t size, typename... E> -CMT_INLINE internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens) +KFR_INTRINSIC internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens) { static_assert(size == sizeof...(E), "Lists must be of equal length"); return internal::expression_sequence<decay<E>...>(list, std::forward<E>(gens)...); @@ -428,7 +465,7 @@ KFR_FN(gen_sequence) * @brief Returns template expression that returns the result of calling \f$ fn(x_i, x_{i-1}) \f$ */ template <typename Fn, typename E1> -CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) +KFR_INTRINSIC internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) { return internal::expression_adjacent<Fn, E1>(std::forward<Fn>(fn), std::forward<E1>(e1)); } @@ -436,37 +473,38 @@ CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) namespace internal { template <typename E> -struct expression_padded : expression_base<E> +struct expression_padded : expression_with_arguments<E> { using value_type = value_type_of<E>; - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } expression_padded(value_type fill_value, E&& e) - : expression_base<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size()) + : expression_with_arguments<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size()) { } template <size_t N> - vec<value_type, N> operator()(cinput_t cinput, size_t index, vec_t<value_type, N> y) const + KFR_INTRINSIC friend vec<value_type, N> get_elements(const expression_padded& self, cinput_t cinput, + size_t index, 
vec_shape<value_type, N> y) { - if (index >= input_size) + if (index >= self.input_size) { - return fill_value; + return self.fill_value; } - else if (index + N <= input_size) + else if (index + N <= self.input_size) { - return this->argument_first(cinput, index, y); + return self.argument_first(cinput, index, y); } else { - vec<value_type, N> x; + vec<value_type, N> x{}; for (size_t i = 0; i < N; i++) { - if (index + i < input_size) - x[i] = this->argument_first(cinput, index + i, vec_t<value_type, 1>())[0]; + if (index + i < self.input_size) + x[i] = self.argument_first(cinput, index + i, vec_shape<value_type, 1>()).front(); else - x[i] = fill_value; + x[i] = self.fill_value; } return x; } @@ -507,44 +545,45 @@ private: }; template <typename... E> -struct expression_pack : expression_base<E...> +struct expression_pack : expression_with_arguments<E...> { constexpr static size_t count = sizeof...(E); - expression_pack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {} + expression_pack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {} using value_type = vec<common_type<value_type_of<E>...>, count>; using T = value_type; - using expression_base<E...>::size; + using expression_with_arguments<E...>::size; template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pack& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return this->call(cinput, fn::packtranspose(), index, y); + return self.call(cinput, fn::packtranspose(), index, y); } }; template <typename... 
E> -struct expression_unpack : private expression_base<E...>, output_expression +struct expression_unpack : private expression_with_arguments<E...>, output_expression { - using expression_base<E...>::begin_block; - using expression_base<E...>::end_block; + using expression_with_arguments<E...>::begin_block; + using expression_with_arguments<E...>::end_block; using output_expression::begin_block; using output_expression::end_block; constexpr static size_t count = sizeof...(E); - expression_unpack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {} + expression_unpack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {} - using expression_base<E...>::size; + using expression_with_arguments<E...>::size; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x) { - output(coutput, index, x, csizeseq_t<count>()); + output(coutput, index, x, csizeseq<count>); } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE expression_unpack& operator=(Input&& input) + KFR_MEM_INTRINSIC expression_unpack& operator=(Input&& input) { process(*this, std::forward<Input>(input)); return *this; @@ -554,7 +593,7 @@ private: template <typename U, size_t N, size_t... indices> void output(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x, csizes_t<indices...>) { - const vec<vec<U, N>, count> xx = compcast<vec<U, N>>(transpose<count>(flatten(x))); + const vec<vec<U, N>, count> xx = vec<vec<U, N>, count>::from_flatten(transpose<count>(flatten(x))); swallow{ (std::get<indices>(this->args)(coutput, index, xx[indices]), void(), 0)... 
}; } }; @@ -600,12 +639,13 @@ task_partition<OutExpr, InExpr> partition(OutExpr&& output, InExpr&& input, size { static_assert(!is_infinite<OutExpr>::value || !is_infinite<InExpr>::value, ""); - minimum_size = minimum_size == 0 ? platform<T>::vector_width * 8 : minimum_size; + minimum_size = minimum_size == 0 ? vector_width<T> * 8 : minimum_size; const size_t size = size_min(output.size(), input.size()); - const size_t chunk_size = align_up(std::max(size / count, minimum_size), platform<T>::vector_width); + const size_t chunk_size = align_up(std::max(size / count, minimum_size), vector_width<T>); task_partition<OutExpr, InExpr> result(std::forward<OutExpr>(output), std::forward<InExpr>(input), size, chunk_size, (size + chunk_size - 1) / chunk_size); return result; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/bitwise.hpp b/include/kfr/base/bitwise.hpp @@ -1,136 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "constants.hpp" -#include "vec.hpp" - -namespace kfr -{ - -CMT_INLINE float bitwisenot(float x) { return fbitcast(~ubitcast(x)); } -CMT_INLINE float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); } -CMT_INLINE float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); } -CMT_INLINE float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } -CMT_INLINE float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } -CMT_INLINE double bitwisenot(double x) { return fbitcast(~ubitcast(x)); } -CMT_INLINE double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); } -CMT_INLINE double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); } -CMT_INLINE double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } -CMT_INLINE double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } - -/// @brief Bitwise Not -template <typename T1> -CMT_INLINE T1 bitwisenot(const T1& x) -{ - return ~x; -} -KFR_FN(bitwisenot) - -/// @brief Bitwise And -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseand(const T1& x, const T2& y) -{ - return x & y; -} -template <typename T> -constexpr CMT_INLINE T bitwiseand(initialvalue<T>) -{ - return constants<T>::allones(); -} -KFR_FN(bitwiseand) - -/// @brief Bitwise And-Not -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y) -{ - return x & ~y; -} -template <typename T> -constexpr inline T bitwiseandnot(initialvalue<T>) -{ - return constants<T>::allones(); -} -KFR_FN(bitwiseandnot) - -/// @brief Bitwise Or -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseor(const T1& x, const T2& y) -{ - return x | y; -} -template <typename T> -constexpr CMT_INLINE T bitwiseor(initialvalue<T>) -{ - return subtype<T>(0); -} 
-KFR_FN(bitwiseor) - -/// @brief Bitwise Xor (Exclusive Or) -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwisexor(const T1& x, const T2& y) -{ - return x ^ y; -} -template <typename T> -constexpr CMT_INLINE T bitwisexor(initialvalue<T>) -{ - return subtype<T>(); -} -KFR_FN(bitwisexor) - -/// @brief Bitwise Left shift -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> shl(const T1& left, const T2& right) -{ - return left << right; -} -KFR_FN(shl) - -/// @brief Bitwise Right shift -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> shr(const T1& left, const T2& right) -{ - return left >> right; -} -KFR_FN(shr) - -/// @brief Bitwise Left Rotate -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> rol(const T1& left, const T2& right) -{ - return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); -} -KFR_FN(rol) - -/// @brief Bitwise Right Rotate -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> ror(const T1& left, const T2& right) -{ - return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); -} -KFR_FN(ror) -} // namespace kfr diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp @@ -1,62 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/clamp.hpp" - -namespace kfr -{ - -/// @brief Returns the first argument clamped to a range [lo, hi] -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), - typename Tout = common_type<T1, T2, T3>> -KFR_INTRIN Tout clamp(const T1& x, const T2& lo, const T3& hi) -{ - return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi)); -} - -/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi] -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_INTRIN internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi) -{ - return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) }; -} - -/// @brief Returns the first argument clamped to a range [0, hi] -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout clamp(const T1& x, const T2& hi) -{ - return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi)); -} - -/// @brief Creates an expression that returns the first argument clamped to a range [0, hi] -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi) -{ - return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) }; -} -} // namespace kfr diff --git a/include/kfr/base/comparison.hpp b/include/kfr/base/comparison.hpp @@ -1,149 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin 
(https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "constants.hpp" -#include "expression.hpp" -#include "vec.hpp" - -namespace kfr -{ - -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y) -{ - return x == y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y) -{ - return x != y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y) -{ - return x < y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y) -{ - return x > y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y) -{ - return x <= y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y) -{ - return x >= y; -} -KFR_FN(equal) -KFR_FN(notequal) -KFR_FN(less) -KFR_FN(greater) -KFR_FN(lessorequal) -KFR_FN(greaterorequal) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, 
E2>::value)> -CMT_INLINE internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2) -{ - return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2) -{ - return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2) -{ - return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::greater, E1, E2> operator>(E1&& e1, E2&& e2) -{ - return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2) -{ - return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2) -{ - return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isnan(const vec<T, N>& x) -{ - return x != x; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isinf(const vec<T, N>& x) -{ - return x == constants<T>::infinity || x == -constants<T>::infinity; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x) -{ - return !isnan(x) && !isinf(x); -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x) -{ - return (x & 
constants<T>::highbitmask()) != 0; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> ispositive(const vec<T, N>& x) -{ - return !isnegative(x); -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> iszero(const vec<T, N>& x) -{ - return x == T(); -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max) -{ - return x >= min && x <= max; -} -} // namespace kfr diff --git a/include/kfr/base/compiletime.hpp b/include/kfr/base/compiletime.hpp @@ -1,84 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "constants.hpp" -#include "operators.hpp" -#include "types.hpp" - -namespace kfr -{ - -namespace compiletime -{ - -template <typename T> -constexpr inline T select(bool c, T x, T y) -{ - return c ? x : y; -} -template <typename T> -constexpr inline T trunc(T x) -{ - return static_cast<T>(static_cast<long long>(x)); -} -template <typename T> -constexpr inline T abs(T x) -{ - return x < T() ? 
-x : x; -} -template <typename T> -constexpr inline T mulsign(T x, T y) -{ - return y < T() ? -x : x; -} -template <typename T> -constexpr inline T sin(T x) -{ - x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>; - constexpr T c2 = -0.16665853559970855712890625; - constexpr T c4 = +8.31427983939647674560546875e-3; - constexpr T c6 = -1.85423981747590005397796630859375e-4; - - x -= c_pi<T>; - T y = abs(x); - y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y); - y = mulsign(y, -x); - - const T y2 = y * y; - T formula = c6; - const T y3 = y2 * y; - formula = fmadd(formula, y2, c4); - formula = fmadd(formula, y2, c2); - formula = formula * y3 + y; - return formula; -} -template <typename T> -constexpr inline T cos(T x) -{ - return sin(x + c_pi<T, 1, 2>); -} -} // namespace compiletime -} // namespace kfr diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp @@ -1,967 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "abs.hpp" -#include "atan.hpp" -#include "constants.hpp" -#include "function.hpp" -#include "hyperbolic.hpp" -#include "log_exp.hpp" -#include "min_max.hpp" -#include "operators.hpp" -#include "select.hpp" -#include "sin_cos.hpp" -#include "sqrt.hpp" - -#ifdef KFR_STD_COMPLEX -#include <complex> -#endif - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -namespace kfr -{ -#ifdef KFR_STD_COMPLEX - -template <typename T> -using complex = std::complex<T>; - -#else -#ifndef KFR_CUSTOM_COMPLEX - -/** - * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for - * std::complex. - */ -template <typename T> -struct complex -{ - static_assert(is_simd_type<T>::value, "Incorrect type for complex"); - constexpr static bool is_pod = true; - constexpr complex() noexcept = default; - constexpr complex(T re) noexcept : re(re), im(0) {} - constexpr complex(T re, T im) noexcept : re(re), im(im) {} - constexpr complex(const complex&) noexcept = default; - constexpr complex(complex&&) noexcept = default; - template <typename U> - constexpr complex(const complex<U>& other) noexcept - : re(static_cast<T>(other.re)), im(static_cast<T>(other.im)) - { - } - template <typename U> - constexpr complex(complex<U>&& other) noexcept : re(std::move(other.re)), im(std::move(other.im)) - { - } -#ifdef CMT_COMPILER_GNU - constexpr complex& operator=(const complex&) noexcept = default; - constexpr complex& operator=(complex&&) noexcept = default; -#else - complex& operator=(const complex&) = default; - complex& operator=(complex&&) = default; -#endif - constexpr const T& real() const noexcept { return re; } - constexpr const T& imag() const noexcept { return im; } - constexpr void real(T value) noexcept { re = value; } - constexpr void imag(T value) noexcept { im = value; } - T re; - T im; - - KFR_INTRIN friend complex operator+(const complex& x, const complex& y) - { - return (make_vector(x) + 
make_vector(y))[0]; - } - KFR_INTRIN friend complex operator-(const complex& x, const complex& y) - { - return (make_vector(x) - make_vector(y))[0]; - } - KFR_INTRIN friend complex operator*(const complex& x, const complex& y) - { - return (make_vector(x) * make_vector(y))[0]; - } - KFR_INTRIN friend complex operator/(const complex& x, const complex& y) - { - return (make_vector(x) / make_vector(y))[0]; - } - - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator+(const complex& x, const U& y) - { - return static_cast<C>(x) + static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator-(const complex& x, const U& y) - { - return static_cast<C>(x) - static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator*(const complex& x, const U& y) - { - return static_cast<C>(x) * static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator/(const complex& x, const U& y) - { - return static_cast<C>(x) / static_cast<C>(y); - } - - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator+(const U& x, const complex& y) - { - return static_cast<C>(x) + static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator-(const U& x, const complex& y) - { - return static_cast<C>(x) - static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator*(const U& x, const complex& y) - { - return static_cast<C>(x) * static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = 
common_type<complex, U>> - KFR_INTRIN friend C operator/(const U& x, const complex& y) - { - return static_cast<C>(x) / static_cast<C>(y); - } - KFR_INTRIN friend complex operator-(const complex& x) { return (-make_vector(x))[0]; } -}; -#endif -#endif -} // namespace kfr -namespace cometa -{ -template <typename T> -struct compound_type_traits<kfr::complex<T>> -{ - constexpr static size_t width = 2; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::complex<U>; - template <typename U> - using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>; - - static constexpr subtype at(const kfr::complex<T>& value, size_t index) - { - return index == 0 ? value.real() : value.imag(); - } -}; -} // namespace cometa -namespace kfr -{ - -/// @brief Alias for complex<f32> -using c32 = complex<f32>; - -/// @brief Alias for complex<f64> -using c64 = complex<f64>; - -/// @brief Alias for complex<fbase> -using cbase = complex<fbase>; - -namespace internal -{ -template <typename T> -constexpr inline vec<T, 2> vcomplex(const complex<T>& v) -{ - return vec<T, 2>(v.real(), v.imag()); -} -} // namespace internal - -/// @brief vec<> specialization for complex numbers. 
Implements all operators -template <typename T, size_t N> -struct vec<complex<T>, N> : private vec<T, 2 * N> -{ - using base = vec<T, 2 * N>; - - using value_type = complex<T>; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return 2 * N; } - - using simd_type = typename base::simd_type; - - constexpr vec() noexcept = default; - constexpr vec(const vec&) noexcept = default; - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - template <int = 0> - constexpr vec(const simd_type& simd) noexcept : base(simd) - { - } - KFR_I_CE vec(czeros_t) noexcept : base(czeros) {} - KFR_I_CE vec(cones_t) noexcept : base(cones) {} - KFR_I_CE vec(const value_type& s) noexcept : base(repeat<N>(vec<T, 2>(s.real(), s.imag()))) {} - - template <typename U> - KFR_I_CE vec(const complex<U>& s) noexcept - : base(repeat<N>(vec<T, 2>(static_cast<T>(s.real()), static_cast<T>(s.imag())))) - { - } - template <typename U> - KFR_I_CE vec(const vec<complex<U>, N>& v) noexcept : base(static_cast<vec<T, N * 2>>(v.flatten())) - { - } - - explicit KFR_I_CE vec(const vec<T, N * 2>& v) noexcept : base(v) {} - - // from real - KFR_I_CE vec(const T& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros))) {} - // from real - template <typename U, typename = enable_if<std::is_convertible<U, T>::value>> - KFR_I_CE vec(const vec<U, N>& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros))) - { - } - - // from list of vectors - template <typename... Us> - KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : base(internal::vcomplex(s0), internal::vcomplex(s1), - internal::vcomplex(static_cast<value_type>(rest))...) 
- { - } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(value_type) * N)> - KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept - { - return vec(vec<T, scalar_size()>::frombits(v.flatten())); - } - -#define KFR_B(x) static_cast<const base&>(x) - // math / bitwise / comparison operators - KFR_I_CE friend vec operator+(const vec& x) noexcept { return x; } - KFR_I_CE friend vec operator-(const vec& x) noexcept { return vec(-KFR_B(x)); } - KFR_I_CE friend vec operator~(const vec& x) noexcept { return vec(~KFR_B(x)); } - - KFR_I_CE friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); } - KFR_I_CE friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); } - CMT_GNU_CONSTEXPR friend vec operator*(const vec& x, const vec& y) noexcept - { - const vec<scalar_type, N* 2> xx = x; - const vec<scalar_type, N* 2> yy = y; - return vec(subadd(xx * dupeven(yy), swap<2>(xx) * dupodd(yy))); - } - CMT_GNU_CONSTEXPR friend vec operator/(const vec& x, const vec& y) noexcept - { - const vec<scalar_type, N* 2> xx = x; - const vec<scalar_type, N* 2> yy = y; - const vec<scalar_type, N* 2> m = (sqr(dupeven(yy)) + sqr(dupodd(yy))); - return vec(swap<2>(subadd(swap<2>(xx) * dupeven(yy), xx * dupodd(yy)) / m)); - } - - KFR_I_CE friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); } - KFR_I_CE friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); } - KFR_I_CE friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); } - - KFR_I_CE friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - KFR_I_CE friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - KFR_I_CE friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - KFR_I_CE friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; } - - KFR_I_CE 
friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - KFR_I_CE friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - KFR_I_CE friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - KFR_I_CE friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - KFR_I_CE friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - KFR_I_CE friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - KFR_I_CE friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return *base::shuffle(scale<2, indices...>()); - } - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return *base::shuffle(y, scale<2, indices...>()); - } - - // element access - struct element; - KFR_I_CE value_type operator[](size_t index) const noexcept { return get(index); } - KFR_I_CE element operator[](size_t index) noexcept { return { *this, index }; } - - KFR_I_CE value_type get(size_t index) const noexcept - { - return reinterpret_cast<const value_type(&)[N]>(*this)[index]; - } - KFR_I_CE void set(size_t index, const value_type& s) noexcept - { - reinterpret_cast<value_type(&)[N]>(*this)[index] = s; - } - template <size_t index> - KFR_I_CE value_type get(csize_t<index>) const noexcept - { - return static_cast<const base&>(*this).shuffle(csizeseq_t<2, index * 2>()); - } - template <size_t index> - KFR_I_CE void set(csize_t<index>, const value_type& s) noexcept - { - *this = vec(static_cast<const base&>(*this)) - .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * 2>() && - csizeseq_t<N>() < csize_t<(index + 1) * 2>()) * - N); - } - struct element - { - KFR_I_CE operator value_type() const noexcept { return 
v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - - element& operator=(const element& s) noexcept - { - v.set(index, static_cast<value_type>(s)); - return *this; - } - template <typename U, size_t M> - element& operator=(const typename vec<U, M>::element& s) noexcept - { - v.set(index, static_cast<value_type>(static_cast<U>(s))); - return *this; - } - - vec& v; - size_t index; - }; - - template <bool aligned = false> - explicit KFR_I_CE vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : base(ptr_cast<T>(src), cbool_t<aligned>()) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - base::write(ptr_cast<T>(dest), cbool_t<aligned>()); - return *this; - } - - const base& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return base::operator*(); } - simd_type& operator*() noexcept { return base::operator*(); } -}; - -/// @brief Returns vector of complex values with real part duplicated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(dupeven(compcast<T>(x))); -} -KFR_FN(cdupreal) - -/// @brief Returns vector of complex values with imaginary part duplicated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(dupodd(compcast<T>(x))); -} -KFR_FN(cdupimag) - -/// @brief Returns vector of complex values with real and imaginary parts swapped -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(swap<2>(compcast<T>(x))); -} -KFR_FN(cswapreim) - -/// @brief Returns vector of complex values with real part negated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) -{ - return 
x ^ complex<T>(-T(), T()); -} -KFR_FN(cnegreal) - -/// @brief Returns vector of complex values with imaginary part negated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) -{ - return x ^ complex<T>(T(), -T()); -} -KFR_FN(cnegimag) - -namespace internal -{ -template <typename T> -struct is_complex_impl : std::false_type -{ -}; -template <typename T> -struct is_complex_impl<complex<T>> : std::true_type -{ -}; - -// vector<complex> to vector<complex> -template <typename To, typename From, size_t N> -struct conversion<vec<complex<To>, N>, vec<complex<From>, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<complex<To>, N> cast(const vec<complex<From>, N>& value) - { - return builtin_convertvector<complex<To>>(value); - } -}; - -// vector to vector<complex> -template <typename To, typename From, size_t N> -struct conversion<vec<complex<To>, N>, vec<From, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<complex<To>, N> cast(const vec<From, N>& value) - { - const vec<To, N> casted = static_cast<vec<To, N>>(value); - return *interleave(casted, zerovector(casted)); - } -}; - -} // namespace internal - -template <typename T, size_t N> -constexpr CMT_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) -{ - return compcast<complex<T>>(x); -} - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) -{ - return compcast<T>(x); -} - -/// @brief Returns the real part of the complex value -template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> -constexpr CMT_INLINE T real(const T& value) -{ - return value; -} - -/// @brief Returns the real part of the complex value -template <typename T> -constexpr CMT_INLINE T real(const complex<T>& value) -{ - return value.real(); -} - -/// @brief Returns the real part of the complex value -template <typename 
T, size_t N> -constexpr CMT_INLINE vec<T, N> real(const vec<complex<T>, N>& value) -{ - return even(compcast<T>(value)); -} - -template <typename T> -using realtype = decltype(kfr::real(std::declval<T>())); -template <typename T> -using realftype = ftype<decltype(kfr::real(std::declval<T>()))>; - -KFR_FN(real) - -/// @brief Returns the real part of the complex value -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::real, E1> real(E1&& x) -{ - return { {}, std::forward<E1>(x) }; -} - -/// @brief Returns the imaginary part of the complex value -template <typename T> -constexpr CMT_INLINE T imag(const complex<T>& value) -{ - return value.imag(); -} - -/// @brief Returns the imaginary part of the complex value -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> imag(const vec<complex<T>, N>& value) -{ - return odd(compcast<T>(value)); -} -KFR_FN(imag) - -/// @brief Returns the imaginary part of the complex value -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::imag, E1> imag(E1&& x) -{ - return { {}, std::forward<E1>(x) }; -} - -/// @brief Constructs complex value from real and imaginary parts -template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>> -constexpr CMT_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0)) -{ - return compcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag))); -} - -/// @brief Constructs complex value from real and imaginary parts -template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>> -constexpr CMT_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0)) -{ - return complex<T>(cast<T>(real), cast<T>(imag)); -} - -namespace intrinsics -{ -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) -{ - return cnegimag(x); -} -template <typename T, size_t N> 
-KFR_SINTRIN vec<complex<T>, N> csin(const vec<complex<T>, N>& x) -{ - return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> csinh(const vec<complex<T>, N>& x) -{ - return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> ccos(const vec<complex<T>, N>& x) -{ - return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x) -{ - return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabs(const vec<complex<T>, N>& x) -{ - const vec<T, N* 2> xx = sqr(cdecom(x)); - return sqrt(even(xx) + odd(xx)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> carg(const vec<complex<T>, N>& x) -{ - const vec<T, N* 2> xx = cdecom(x); - return atan2(even(xx), odd(xx)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog(const vec<complex<T>, N>& x) -{ - return make_complex(log(cabs(x)), carg(x)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog2(const vec<complex<T>, N>& x) -{ - return clog(x) * c_recip_log_2<T>; -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog10(const vec<complex<T>, N>& x) -{ - return clog(x) * c_recip_log_10<T>; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp(const vec<complex<T>, N>& x) -{ - return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x) -{ - return cexp(x * c_log_2<T>); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x) -{ - return cexp(x * c_log_10<T>); -} - -template <typename T, size_t N> -KFR_SINTRIN 
vec<complex<T>, N> polar(const vec<complex<T>, N>& x) -{ - return make_complex(cabs(x), carg(x)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x) -{ - return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabsdup(const vec<T, N>& x) -{ - x = sqr(x); - return sqrt(x + swap<2>(x)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x) -{ - const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5); - return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t)); -} - -KFR_I_CONVERTER(cconj) -KFR_I_CONVERTER(csin) -KFR_I_CONVERTER(csinh) -KFR_I_CONVERTER(ccos) -KFR_I_CONVERTER(ccosh) -KFR_I_CONVERTER(clog) -KFR_I_CONVERTER(clog2) -KFR_I_CONVERTER(clog10) -KFR_I_CONVERTER(cexp) -KFR_I_CONVERTER(cexp2) -KFR_I_CONVERTER(cexp10) -KFR_I_CONVERTER(polar) -KFR_I_CONVERTER(cartesian) -KFR_I_CONVERTER(csqrt) - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabs(const vec<T, N>& a) -{ - return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> carg(const vec<T, N>& a) -{ - return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a))); -} -template <typename T1> -KFR_SINTRIN realtype<T1> cabs(const T1& a) -{ - using vecout = vec1<T1>; - return to_scalar(intrinsics::cabs(vecout(a))); -} -template <typename T1> -KFR_SINTRIN realtype<T1> carg(const T1& a) -{ - using vecout = vec1<T1>; - return to_scalar(intrinsics::carg(vecout(a))); -} -} // namespace intrinsics - -KFR_I_FN(cconj) -KFR_I_FN(csin) -KFR_I_FN(csinh) -KFR_I_FN(ccos) -KFR_I_FN(ccosh) -KFR_I_FN(cabs) -KFR_I_FN(carg) -KFR_I_FN(clog) -KFR_I_FN(clog2) -KFR_I_FN(clog10) -KFR_I_FN(cexp) -KFR_I_FN(cexp2) -KFR_I_FN(cexp10) -KFR_I_FN(polar) -KFR_I_FN(cartesian) -KFR_I_FN(csqrt) - -/// @brief Returns the sine of the complex number x 
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csin(const T1& x) -{ - return intrinsics::csin(x); -} - -/// @brief Returns template expression that returns the sine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csin, E1> csin(E1&& x) -{ - return { fn::csin(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic sine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csinh(const T1& x) -{ - return intrinsics::csinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csinh, E1> csinh(E1&& x) -{ - return { fn::csinh(), std::forward<E1>(x) }; -} - -/// @brief Returns the cosine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 ccos(const T1& x) -{ - return intrinsics::ccos(x); -} - -/// @brief Returns template expression that returns the cosine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::ccos, E1> ccos(E1&& x) -{ - return { fn::ccos(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 ccosh(const T1& x) -{ - return intrinsics::ccosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::ccosh, E1> ccosh(E1&& x) -{ - return { fn::ccosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the absolute value (magnitude) of the complex number x -template <typename T1, 
KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC realtype<T1> cabs(const T1& x) -{ - return intrinsics::cabs(x); -} - -/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cabs, E1> cabs(E1&& x) -{ - return { fn::cabs(), std::forward<E1>(x) }; -} - -/// @brief Returns the phase angle (argument) of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC realtype<T1> carg(const T1& x) -{ - return intrinsics::carg(x); -} - -/// @brief Returns template expression that returns the phase angle (argument) of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::carg, E1> carg(E1&& x) -{ - return { fn::carg(), std::forward<E1>(x) }; -} - -/// @brief Returns the complex conjugate of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cconj(const T1& x) -{ - return intrinsics::cconj(x); -} - -/// @brief Returns template expression that returns the complex conjugate of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cconj, E1> cconj(E1&& x) -{ - return { fn::cconj(), std::forward<E1>(x) }; -} - -/// @brief Returns the natural logarithm of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog(const T1& x) -{ - return intrinsics::clog(x); -} - -/// @brief Returns template expression that returns the natural logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog, E1> clog(E1&& x) -{ - return { fn::clog(), std::forward<E1>(x) }; -} - -/// @brief Returns the binary (base-2) logarithm of the complex number x -template <typename 
T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog2(const T1& x) -{ - return intrinsics::clog2(x); -} - -/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog2, E1> clog2(E1&& x) -{ - return { fn::clog2(), std::forward<E1>(x) }; -} - -/// @brief Returns the common (base-10) logarithm of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog10(const T1& x) -{ - return intrinsics::clog10(x); -} - -/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog10, E1> clog10(E1&& x) -{ - return { fn::clog10(), std::forward<E1>(x) }; -} - -/// @brief Returns \f$e\f$ raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cexp(const T1& x) -{ - return intrinsics::cexp(x); -} - -/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp, E1> cexp(E1&& x) -{ - return { fn::cexp(), std::forward<E1>(x) }; -} - -/// @brief Returns 2 raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cexp2(const T1& x) -{ - return intrinsics::cexp2(x); -} - -/// @brief Returns template expression that returns 2 raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp2, E1> cexp2(E1&& x) -{ - return { fn::cexp2(), std::forward<E1>(x) }; -} - -/// @brief Returns 10 raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC 
T1 cexp10(const T1& x) -{ - return intrinsics::cexp10(x); -} - -/// @brief Returns template expression that returns 10 raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp10, E1> cexp10(E1&& x) -{ - return { fn::cexp10(), std::forward<E1>(x) }; -} - -/// @brief Converts complex number to polar -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 polar(const T1& x) -{ - return intrinsics::polar(x); -} - -/// @brief Returns template expression that converts complex number to polar -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::polar, E1> polar(E1&& x) -{ - return { fn::polar(), std::forward<E1>(x) }; -} - -/// @brief Converts complex number to cartesian -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cartesian(const T1& x) -{ - return intrinsics::cartesian(x); -} - -/// @brief Returns template expression that converts complex number to cartesian -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cartesian, E1> cartesian(E1&& x) -{ - return { fn::cartesian(), std::forward<E1>(x) }; -} - -/// @brief Returns square root of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csqrt(const T1& x) -{ - return intrinsics::csqrt(x); -} - -/// @brief Returns template expression that returns square root of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csqrt, E1> csqrt(E1&& x) -{ - return { fn::csqrt(), std::forward<E1>(x) }; -} -} // namespace kfr - -namespace std -{ -template <typename T1, typename T2> -struct common_type<kfr::complex<T1>, kfr::complex<T2>> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2> -struct 
common_type<kfr::complex<T1>, T2> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2> -struct common_type<T1, kfr::complex<T2>> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::complex<T1>, kfr::vec<T2, N>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, kfr::complex<T2>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -} // namespace std - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/constants.hpp b/include/kfr/base/constants.hpp @@ -1,299 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "types.hpp" -#include <limits> - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4309)) -CMT_PRAGMA_MSVC(warning(disable : 4146)) - -namespace kfr -{ - -#if CMT_COMPILER_GNU -constexpr double infinity = __builtin_inf(); -constexpr double qnan = __builtin_nan(""); -#else -constexpr double infinity = HUGE_VAL; -constexpr double qnan = NAN; -#endif - -template <typename T> -struct constants -{ -public: - using Tsub = subtype<T>; - - constexpr static Tsub pi_s(int m, int d = 1) { return pi * m / d; } - constexpr static Tsub recip_pi_s(int m, int d = 1) { return recip_pi * m / d; } - - constexpr static Tsub pi = static_cast<Tsub>(3.1415926535897932384626433832795); - constexpr static Tsub sqr_pi = static_cast<Tsub>(9.8696044010893586188344909998762); - constexpr static Tsub recip_pi = static_cast<Tsub>(0.31830988618379067153776752674503); - constexpr static Tsub degtorad = static_cast<Tsub>(pi / 180); - constexpr static Tsub radtodeg = static_cast<Tsub>(pi * 180); - constexpr static Tsub e = static_cast<Tsub>(2.718281828459045235360287471352662); - constexpr static Tsub recip_log_2 = static_cast<Tsub>(1.442695040888963407359924681001892137426645954); - constexpr static Tsub recip_log_10 = static_cast<Tsub>(0.43429448190325182765112891891661); - constexpr static Tsub log_2 = static_cast<Tsub>(0.69314718055994530941723212145818); - constexpr static Tsub log_10 = static_cast<Tsub>(2.3025850929940456840179914546844); - constexpr static Tsub sqrt_2 = static_cast<Tsub>(1.4142135623730950488016887242097); - - constexpr static Tsub fold_constant_div = choose_const<Tsub>( - CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01)); - - constexpr static Tsub fold_constant_hi = choose_const<Tsub>( - 
CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01)); - constexpr static Tsub fold_constant_rem1 = - choose_const<Tsub>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f), - CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08)); - constexpr static Tsub fold_constant_rem2 = - choose_const<Tsub>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f), - CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15)); - - constexpr static Tsub epsilon = std::numeric_limits<Tsub>::epsilon(); - constexpr static Tsub infinity = std::numeric_limits<Tsub>::infinity(); - constexpr static Tsub neginfinity = -std::numeric_limits<Tsub>::infinity(); - constexpr static Tsub qnan = std::numeric_limits<Tsub>::quiet_NaN(); - -#if CMT_COMPILER_GNU - - CMT_PRAGMA_GNU(GCC diagnostic push) - CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow") - - constexpr static Tsub allones() - { - if (is_same<Tsub, f32>::value) - { - return -__builtin_nanf("0xFFFFFFFF"); - } - else if (is_same<Tsub, f64>::value) - { - return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); - } - else - { - return static_cast<Tsub>(-1ll); - } - } - - constexpr static Tsub allzeros() { return Tsub(0); } - - constexpr static Tsub highbitmask() - { - if (is_same<Tsub, f32>::value) - { - return -0.0f; - } - else if (is_same<Tsub, f64>::value) - { - return -0.0; - } - else - { - return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); - } - } - - constexpr static Tsub invhighbitmask() - { - if (is_same<Tsub, f32>::value) - { - return __builtin_nanf("0xFFFFFFFF"); - } - else if (is_same<Tsub, f64>::value) - { - return __builtin_nan("0xFFFFFFFFFFFFFFFF"); - } - else - { - return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); - } - } - CMT_PRAGMA_GNU(GCC diagnostic pop) -#else - - static Tsub allones() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(bitcast<f32>(0xFFFFFFFFu)); - } - else if (is_same<Tsub, f64>::value) - { - return 
static_cast<Tsub>(bitcast<f64>(0xFFFFFFFFFFFFFFFFull)); - } - else - { - return static_cast<Tsub>(-1ll); - } - } - - constexpr static Tsub allzeros() { return Tsub(0); } - - static Tsub highbitmask() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(-0.0f); - } - else if (is_same<Tsub, f64>::value) - { - return static_cast<Tsub>(-0.0); - } - else - { - return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); - } - } - - static Tsub invhighbitmask() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(bitcast<f32>(0x7FFFFFFFu)); - } - else if (is_same<Tsub, f64>::value) - { - return static_cast<Tsub>(bitcast<f64>(0x7FFFFFFFFFFFFFFFull)); - } - else - { - return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); - } - } -#endif -}; - -template <typename T> -constexpr subtype<T> constants<T>::pi; -template <typename T> -constexpr subtype<T> constants<T>::sqr_pi; -template <typename T> -constexpr subtype<T> constants<T>::recip_pi; -template <typename T> -constexpr subtype<T> constants<T>::degtorad; -template <typename T> -constexpr subtype<T> constants<T>::radtodeg; -template <typename T> -constexpr subtype<T> constants<T>::e; -template <typename T> -constexpr subtype<T> constants<T>::recip_log_2; -template <typename T> -constexpr subtype<T> constants<T>::recip_log_10; -template <typename T> -constexpr subtype<T> constants<T>::log_2; -template <typename T> -constexpr subtype<T> constants<T>::log_10; -template <typename T> -constexpr subtype<T> constants<T>::sqrt_2; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_div; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_hi; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_rem1; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_rem2; -template <typename T> -constexpr subtype<T> constants<T>::epsilon; -template <typename T> -constexpr subtype<T> constants<T>::infinity; -template <typename T> 
-constexpr subtype<T> constants<T>::neginfinity; -template <typename T> -constexpr subtype<T> constants<T>::qnan; - -/// π (pi) -/// c_pi<f64, 4> = 4pi -/// c_pi<f64, 3, 4> = 3/4pi -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d); - -/// π² (pi²) -/// c_sqr_pi<f64, 4> = 4pi² -/// c_sqr_pi<f64, 3, 4> = 3/4pi² -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d); - -/// 1/π (1/pi) -/// c_recip_pi<f64> 1/pi -/// c_recip_pi<f64, 4> 4/pi -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d); - -/// degree to radian conversion factor -template <typename T> -constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>; - -/// radian to degree conversion factor -template <typename T> -constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>; - -/// e, Euler's number -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d); - -template <typename T> -constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 
23 : 52; - -template <typename T> -constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1; - -template <typename T> -constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon()); - -/// infinity -template <typename T> -constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity(); - -/// -infinity -template <typename T> -constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity(); - -/// Quiet NaN -template <typename T> -constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN(); - -template <typename T> -constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954); - -template <typename T> -constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661); - -template <typename T> -constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818); - -template <typename T> -constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844); - -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d); -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup conversion * @{ */ /* @@ -25,12 +25,15 @@ */ #pragma once -#include "types.hpp" +#include "../math/clamp.hpp" +#include "../simd/types.hpp" +#include "../simd/vec.hpp" #include "univector.hpp" -#include "vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class audio_sample_type { @@ -179,7 +182,7 @@ template <typename Tout, typename Tin, typename Tout_traits = audio_sample_trait inline Tout convert_sample(const Tin& in) { constexpr auto scale = Tout_traits::scale / Tin_traits::scale; - return cast<Tout>(clamp(in * scale, -Tout_traits::scale, +Tout_traits::scale)); + return innercast<Tout>(clamp(in * scale, -Tout_traits::scale, 
+Tout_traits::scale)); } /// @brief Deinterleaves and converts audio samples @@ -275,4 +278,5 @@ void convert(void* out, audio_sample_type out_type, const Tin* in, size_t size) convert(reinterpret_cast<type*>(out), in, size); }); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp @@ -1,107 +0,0 @@ -/** @addtogroup shuffle - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "shuffle.hpp" -#include "types.hpp" - -namespace kfr -{ - -namespace internal -{ - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") - -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } - -template <u32 m, u32 shift, u32... 
values> -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) -{ - return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); -} - -template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) -{ - return bit_permute_step_impl( - x, - cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); -} - -template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) -{ - return bit_permute_step_impl( - x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); -} - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -template <size_t radix, size_t bits> -struct shuffle_index_digitreverse -{ - constexpr inline size_t operator()(size_t index) const noexcept - { - return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); - } -}; -} // namespace internal - -template <size_t radix, size_t group = 1, typename T, size_t N> -CMT_INLINE vec<T, N> digitreverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>( - csizeseq_t<N / group>().map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>()))); -} - -template <size_t groupsize = 1, typename T, size_t N> -CMT_INLINE vec<T, N> bitreverse(const vec<T, N>& x) -{ - return digitreverse<2, groupsize>(x); -} - -template <size_t groupsize = 1, typename T, size_t N> -CMT_INLINE vec<T, N> digitreverse4(const vec<T, N>& x) -{ - return digitreverse<4, groupsize>(x); -} - -template <size_t bits> -constexpr inline u32 bitreverse(u32 x) -{ - return internal::digitreverse_impl<bits>(x, csize_t<2>()); -} - -template <size_t bits> -constexpr inline u32 digitreverse4(u32 x) -{ - return internal::digitreverse_impl<bits>(x, csize_t<4>()); -} -} // namespace kfr diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp @@ -25,9 +25,10 @@ */ #pragma once -#include "platform.hpp" -#include "types.hpp" -#include 
"vec.hpp" +#include "../simd/platform.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/types.hpp" +#include "../simd/vec.hpp" #include <tuple> #ifdef KFR_STD_COMPLEX @@ -36,9 +37,12 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") namespace kfr { +inline namespace CMT_ARCH_NAME +{ constexpr size_t inout_context_size = 16; @@ -73,20 +77,20 @@ struct complex; constexpr size_t infinite_size = static_cast<size_t>(-1); -CMT_INLINE constexpr size_t size_add(size_t x, size_t y) +CMT_INTRINSIC constexpr size_t size_add(size_t x, size_t y) { return (x == infinite_size || y == infinite_size) ? infinite_size : x + y; } -CMT_INLINE constexpr size_t size_sub(size_t x, size_t y) +CMT_INTRINSIC constexpr size_t size_sub(size_t x, size_t y) { return (x == infinite_size || y == infinite_size) ? infinite_size : (x > y ? x - y : 0); } -CMT_INLINE constexpr size_t size_min(size_t x) noexcept { return x; } +CMT_INTRINSIC constexpr size_t size_min(size_t x) CMT_NOEXCEPT { return x; } template <typename... Ts> -CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... rest) noexcept +CMT_INTRINSIC constexpr size_t size_min(size_t x, size_t y, Ts... rest) CMT_NOEXCEPT { return size_min(x < y ? x : y, rest...); } @@ -94,23 +98,23 @@ CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... 
rest) noexcept /// @brief Base class of all input expressoins struct input_expression { - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } constexpr static bool is_incremental = false; - CMT_INLINE constexpr void begin_block(cinput_t, size_t) const {} - CMT_INLINE constexpr void end_block(cinput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void begin_block(cinput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void end_block(cinput_t, size_t) const {} }; /// @brief Base class of all output expressoins struct output_expression { - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } constexpr static bool is_incremental = false; - CMT_INLINE constexpr void begin_block(coutput_t, size_t) const {} - CMT_INLINE constexpr void end_block(coutput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void begin_block(coutput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void end_block(coutput_t, size_t) const {} }; /// @brief Check if the type argument is an input expression @@ -141,17 +145,14 @@ using is_numeric_args = and_t<is_numeric<Ts>...>; namespace internal { template <typename T, size_t N, typename Fn> -static vec<T, N> get_fn_value(size_t index, Fn&& fn) +inline vec<T, N> get_fn_value(size_t index, Fn&& fn) { - vec<T, N> x; - for (size_t i = 0; i < N; i++) - x[i] = fn(index + i); - return x; + return apply(fn, enumerate<size_t, N>() + index); } } // namespace internal template <typename E, typename Fn> -static void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr) +void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr) { using T = value_type_of<E>; ::testo::test_case* test = ::testo::active_test(); @@ -159,38 +160,20 @@ static void test_expression(const E& 
expr, size_t size, Fn&& fn, const char* exp test->check(c <= expr.size() == size, expression); if (expr.size() != size) return; - size = size_min(size, 100); + size = size_min(size, 200); + constexpr size_t maxsize = 2 + ilog2(vector_width<T> * 2); for (size_t i = 0; i < size;) { const size_t next_size = - std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % 6)); - switch (next_size) - { - case 1: - test->check(c <= expr(cinput, i, vec_t<T, 1>()) == internal::get_fn_value<T, 1>(i, fn), - expression); - break; - case 2: - test->check(c <= expr(cinput, i, vec_t<T, 2>()) == internal::get_fn_value<T, 2>(i, fn), - expression); - break; - case 4: - test->check(c <= expr(cinput, i, vec_t<T, 4>()) == internal::get_fn_value<T, 4>(i, fn), - expression); - break; - case 8: - test->check(c <= expr(cinput, i, vec_t<T, 8>()) == internal::get_fn_value<T, 8>(i, fn), - expression); - break; - case 16: - test->check(c <= expr(cinput, i, vec_t<T, 16>()) == internal::get_fn_value<T, 16>(i, fn), - expression); - break; - case 32: - test->check(c <= expr(cinput, i, vec_t<T, 32>()) == internal::get_fn_value<T, 32>(i, fn), + std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % maxsize)); + + cswitch(csize<1> << csizeseq<maxsize>, next_size, [&](auto x) { + constexpr size_t nsize = val_of(decltype(x)()); + ::testo::scope s(as_string("i = ", i)); + test->check(c <= get_elements(expr, cinput, i, vec_shape<T, nsize>()) == + internal::get_fn_value<T, nsize>(i, fn), expression); - break; - } + }); i += next_size; } } @@ -208,33 +191,26 @@ template <typename T, typename Fn> struct expression_lambda : input_expression { using value_type = T; - CMT_INLINE expression_lambda(Fn&& fn) : fn(std::move(fn)) {} + KFR_MEM_INTRINSIC expression_lambda(Fn&& fn) : fn(std::move(fn)) {} - template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_t<T, N>>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) 
const + template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_shape<T, N>>::value)> + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return fn(cinput, index, y); + return self.fn(cinput, index, y); } template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, size_t>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t index, + vec_shape<T, N>) { - vec<T, N> result; - for (size_t i = 0; i < N; i++) - { - result[i] = fn(index + i); - } - return result; + return apply(self.fn, enumerate<size_t, N>() + index); } template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t, + vec_shape<T, N>) { - vec<T, N> result; - for (size_t i = 0; i < N; i++) - { - result[i] = fn(); - } - return result; + return apply<N>(self.fn); } Fn fn; @@ -269,19 +245,22 @@ namespace internal { template <typename... Args> -struct expression_base : input_expression +struct expression_with_arguments : input_expression { - CMT_INLINE constexpr size_t size() const noexcept { return size_impl(indicesfor_t<Args...>()); } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT + { + return size_impl(indicesfor_t<Args...>()); + } constexpr static size_t count = sizeof...(Args); - expression_base() = delete; - constexpr expression_base(Args&&... args) noexcept : args(std::forward<Args>(args)...) {} + expression_with_arguments() = delete; + constexpr expression_with_arguments(Args&&... args) CMT_NOEXCEPT : args(std::forward<Args>(args)...) 
{} - CMT_INLINE void begin_block(cinput_t cinput, size_t size) const + KFR_MEM_INTRINSIC void begin_block(cinput_t cinput, size_t size) const { begin_block_impl(cinput, size, indicesfor_t<Args...>()); } - CMT_INLINE void end_block(cinput_t cinput, size_t size) const + KFR_MEM_INTRINSIC void end_block(cinput_t cinput, size_t size) const { end_block_impl(cinput, size, indicesfor_t<Args...>()); } @@ -290,44 +269,48 @@ struct expression_base : input_expression protected: template <size_t... indices> - CMT_INLINE constexpr size_t size_impl(csizes_t<indices...>) const noexcept + KFR_MEM_INTRINSIC constexpr size_t size_impl(csizes_t<indices...>) const CMT_NOEXCEPT { return size_min(std::get<indices>(this->args).size()...); } template <typename Fn, typename T, size_t N> - CMT_INLINE vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_t<T, N> x) const + KFR_MEM_INTRINSIC vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_shape<T, N> x) const { return call_impl(cinput, std::forward<Fn>(fn), indicesfor_t<Args...>(), index, x); } template <size_t ArgIndex, typename U, size_t N, typename T = value_type_of<typename details::get_nth_type<ArgIndex, Args...>::type>> - CMT_INLINE vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index, vec_t<U, N>) const + KFR_MEM_INTRINSIC vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index, + vec_shape<U, N>) const { static_assert(ArgIndex < count, "Incorrect ArgIndex"); - return static_cast<vec<U, N>>(std::get<ArgIndex>(this->args)(cinput, index, vec_t<T, N>())); + return get_elements( + static_cast<vec<U, N>>(std::get<ArgIndex>(this->args), cinput, index, vec_shape<T, N>())); } template <typename U, size_t N, typename T = value_type_of<typename details::get_nth_type<0, Args...>::type>> - CMT_INLINE vec<U, N> argument_first(cinput_t cinput, size_t index, vec_t<U, N>) const + KFR_MEM_INTRINSIC vec<U, N> argument_first(cinput_t cinput, size_t index, vec_shape<U, N>) const { - return static_cast<vec<U, 
N>>(std::get<0>(this->args)(cinput, index, vec_t<T, N>())); + return static_cast<vec<U, N>>( + get_elements(std::get<0>(this->args), cinput, index, vec_shape<T, N>())); } private: template <typename Fn, typename T, size_t N, size_t... indices> - CMT_INLINE vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index, - vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index, + vec_shape<T, N>) const { - return fn(std::get<indices>(this->args)(cinput, index, vec_t<value_type_of<Args>, N>())...); + return fn(get_elements(std::get<indices>(this->args), cinput, index, + vec_shape<value_type_of<Args>, N>())...); } template <size_t... indices> - CMT_INLINE void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const + KFR_MEM_INTRINSIC void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).begin_block(cinput, size), 0)... }; } template <size_t... indices> - CMT_INLINE void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const + KFR_MEM_INTRINSIC void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).end_block(cinput, size), 0)... 
}; } @@ -338,14 +321,15 @@ struct expression_scalar : input_expression { using value_type = T; expression_scalar() = delete; - constexpr expression_scalar(const T& val) noexcept : val(val) {} - constexpr expression_scalar(const vec<T, width>& val) noexcept : val(val) {} + constexpr expression_scalar(const T& val) CMT_NOEXCEPT : val(val) {} + constexpr expression_scalar(const vec<T, width>& val) CMT_NOEXCEPT : val(val) {} vec<T, width> val; template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_scalar& self, cinput_t, size_t, + vec_shape<T, N>) { - return resize<N>(val); + return resize<N>(self.val); } }; @@ -377,27 +361,30 @@ template <typename T> using arg = typename internal::arg_impl<decay<T>, T>::type; template <typename Fn, typename... Args> -struct expression_function : expression_base<arg<Args>...> +struct expression_function : expression_with_arguments<arg<Args>...> { using value_type = subtype<decltype(std::declval<Fn>()(std::declval<vec<value_type_of<arg<Args>>, 1>>()...))>; using T = value_type; - expression_function(Fn&& fn, arg<Args>&&... args) noexcept - : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(std::forward<Fn>(fn)) + expression_function(Fn&& fn, arg<Args>&&... args) CMT_NOEXCEPT + : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...), + fn(std::forward<Fn>(fn)) { } - expression_function(const Fn& fn, arg<Args>&&... args) noexcept - : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(fn) + expression_function(const Fn& fn, arg<Args>&&... 
args) CMT_NOEXCEPT + : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...), + fn(fn) { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_function& self, cinput_t cinput, + size_t index, vec_shape<T, N> x) { - return this->call(cinput, fn, index, x); + return self.call(cinput, self.fn, index, x); } - const Fn& get_fn() const noexcept { return fn; } + const Fn& get_fn() const CMT_NOEXCEPT { return fn; } protected: Fn fn; @@ -405,25 +392,25 @@ protected: } // namespace internal template <typename A> -CMT_INLINE internal::arg<A> e(A&& a) +CMT_INTRINSIC internal::arg<A> e(A&& a) { return internal::arg<A>(std::forward<A>(a)); } template <typename T> -CMT_INLINE internal::expression_scalar<T> scalar(const T& val) +CMT_INTRINSIC internal::expression_scalar<T> scalar(const T& val) { return internal::expression_scalar<T>(val); } template <typename T, size_t N> -CMT_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val) +CMT_INTRINSIC internal::expression_scalar<T, N> scalar(const vec<T, N>& val) { return internal::expression_scalar<T, N>(val); } template <typename Fn, typename... Args> -CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args) +CMT_INTRINSIC internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args) { return internal::expression_function<decay<Fn>, Args...>(std::forward<Fn>(fn), std::forward<Args>(args)...); @@ -434,17 +421,16 @@ CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn& * @param args new arguments for the function */ template <typename Fn, typename... OldArgs, typename... NewArgs> -CMT_INLINE internal::expression_function<Fn, NewArgs...> rebind( +CMT_INTRINSIC internal::expression_function<Fn, NewArgs...> rebind( const internal::expression_function<Fn, OldArgs...>& e, NewArgs&&... 
args) { return internal::expression_function<Fn, NewArgs...>(e.get_fn(), std::forward<NewArgs>(args)...); } -template <cpu_t c = cpu_t::native, size_t width = 0, typename OutputExpr, typename InputExpr, - size_t groupsize = 1> -CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0, - size_t size = infinite_size, coutput_t coutput = nullptr, - cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>()) +template <size_t width = 0, typename OutputExpr, typename InputExpr, size_t groupsize = 1> +CMT_INTRINSIC static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0, + size_t size = infinite_size, coutput_t coutput = nullptr, + cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>()) { using Tin = value_type_of<InputExpr>; static_assert(is_output_expression<OutputExpr>::value, "OutFn must be an expression"); @@ -453,24 +439,25 @@ CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t s size = size_sub(size_min(out.size(), in.size(), size_add(size, start)), start); if (size == 0 || size == infinite_size) return size; - const size_t end = start + size; out.begin_block(coutput, size); in.begin_block(cinput, size); #ifdef NDEBUG - constexpr size_t w = width == 0 ? platform<Tin, c>::vector_capacity / 4 : width; + constexpr size_t w = width == 0 ? maximum_vector_size<Tin> : width; #else - constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width : width; + constexpr size_t w = width == 0 ? 
vector_width<Tin> : width; #endif + static_assert(w > 0 && is_poweroftwo(w), ""); + size_t i = start; CMT_LOOP_NOUNROLL for (; i < start + size / w * w; i += w) - out(coutput, i, in(cinput, i, vec_t<Tin, w>())); + out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, w>())); CMT_LOOP_NOUNROLL for (; i < start + size / groupsize * groupsize; i += groupsize) - out(coutput, i, in(cinput, i, vec_t<Tin, groupsize>())); + out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, groupsize>())); in.end_block(cinput, size); out.end_block(coutput, size); @@ -483,11 +470,12 @@ struct input_expression_base : input_expression virtual ~input_expression_base() {} virtual T input(size_t index) const = 0; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + friend KFR_INTRINSIC vec<U, N> get_elements(const input_expression_base& self, cinput_t, size_t index, + vec_shape<U, N>) { vec<U, N> out; for (size_t i = 0; i < N; i++) - out[i] = static_cast<U>(input(index + i)); + out[i] = static_cast<U>(self.input(index + i)); return out; } }; @@ -499,12 +487,19 @@ struct output_expression_base : output_expression virtual void output(size_t index, const T& value) = 0; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value) { for (size_t i = 0; i < N; i++) output(index + i, static_cast<T>(value[i])); } }; + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +CMT_INTRINSIC internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y) +{ + return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/filter.hpp b/include/kfr/base/filter.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup filter * @{ */ /* @@ 
-32,6 +32,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ /// @brief Abstract base class for filters with one argument. Mainly for DSP template <typename T> @@ -131,16 +133,17 @@ protected: /// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type template <typename E, typename T = value_type_of<E>> -KFR_SINTRIN expression_filter<T> to_filter(E&& e) +KFR_INTRINSIC expression_filter<T> to_filter(E&& e) { return expression_filter<T>(to_pointer(std::move(e))); } /// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type template <typename T, typename E> -KFR_SINTRIN expression_filter<T> to_filter(expression_pointer<T>&& e) +KFR_INTRINSIC expression_filter<T> to_filter(expression_pointer<T>&& e) { return expression_filter<T>(std::move(e)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/fraction.hpp b/include/kfr/base/fraction.hpp @@ -25,8 +25,7 @@ */ #pragma once -#include "operators.hpp" -#include "vec.hpp" +#include "../simd/types.hpp" namespace kfr { diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -1,268 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "expression.hpp" -#include "shuffle.hpp" -#include "types.hpp" -#include "vec.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") - -namespace kfr -{ - -#define KFR_I_CONVERTER(fn) \ - template <typename T1, typename... Args, typename Tout = ::cometa::common_type<T1, Args...>> \ - KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \ - { \ - using vecout = vec1<Tout>; \ - return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \ - } - -#define KFR_I_FLT_CONVERTER(fn) \ - template <typename T1, typename... Args, \ - typename Tout = ::kfr::flt_type<::cometa::common_type<T1, Args...>>> \ - KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \ - { \ - using vecout = vec1<Tout>; \ - return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \ - } - -namespace intrinsics -{ -#ifdef CMT_ARCH_X86 -using f32sse = vec<f32, 4>; -using f64sse = vec<f64, 2>; -using i8sse = vec<i8, 16>; -using i16sse = vec<i16, 8>; -using i32sse = vec<i32, 4>; -using i64sse = vec<i64, 2>; -using u8sse = vec<u8, 16>; -using u16sse = vec<u16, 8>; -using u32sse = vec<u32, 4>; -using u64sse = vec<u64, 2>; - -using f32avx = vec<f32, 8>; -using f64avx = vec<f64, 4>; -using i8avx = vec<i8, 32>; -using i16avx = vec<i16, 16>; -using i32avx = vec<i32, 8>; -using i64avx = vec<i64, 4>; -using u8avx = vec<u8, 32>; -using u16avx = vec<u16, 16>; -using u32avx = vec<u32, 8>; -using u64avx = vec<u64, 4>; - -using f32avx512 = vec<f32, 16>; -using f64avx512 = vec<f64, 8>; -using i8avx512 = vec<i8, 64>; -using i16avx512 = vec<i16, 32>; -using i32avx512 = vec<i32, 16>; -using i64avx512 = vec<i64, 8>; -using u8avx512 = vec<u8, 64>; -using u16avx512 = vec<u16, 32>; -using u32avx512 = vec<u32, 16>; -using u64avx512 = vec<u64, 8>; - -#else 
-using f32neon = vec<f32, 4>; -using f64neon = vec<f64, 2>; -using i8neon = vec<i8, 16>; -using i16neon = vec<i16, 8>; -using i32neon = vec<i32, 4>; -using i64neon = vec<i64, 2>; -using u8neon = vec<u8, 16>; -using u16neon = vec<u16, 8>; -using u32neon = vec<u32, 4>; -using u64neon = vec<u64, 2>; -#endif - -template <cpu_t c, typename T> -constexpr inline size_t next_simd_width(size_t n) -{ -#ifdef CMT_ARCH_X86 - return n > platform<T, cpu_t::sse2>::vector_width ? platform<T, c>::vector_width - : platform<T, cpu_t::sse2>::vector_width; -#endif -#ifdef CMT_ARCH_ARM - return platform<T, cpu_t::neon>::vector_width; -#endif -} - -template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x) -{ - return extend<Nout>(x); -} - -template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) -{ - return widen<Nout>(x, value); -} - -#define KFR_HANDLE_ALL_SIZES_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \ - } - -#define KFR_HANDLE_ALL_SIZES_F_1(fn) \ - 
template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_f_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_I_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_i_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_i_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_U_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_u_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_u_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && !is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && !is_f_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_2(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < 
platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ - { \ - return concat(fn(low(a), low(b)), fn(high(a), high(b))); \ - } - -#define KFR_HANDLE_ALL_SIZES_2_INT(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \ - { \ - return slice<0, N>(fn(expand_simd(a), b)); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \ - { \ - return concat(fn(low(a), b), fn(high(a), b)); \ - } - -#define KFR_HANDLE_ALL_SIZES_3(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ - { \ - return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \ - } - -#define KFR_HANDLE_ALL_SIZES_4(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ - { \ - 
return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \ - } - -template <typename T> -using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>; - -template <typename T> -inline T to_scalar(const T& value) -{ - return value; -} -template <typename T> -inline T to_scalar(const vec<T, 1>& value) -{ - return value[0]; -} -} // namespace intrinsics -} // namespace kfr -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/function_expressions.hpp b/include/kfr/base/function_expressions.hpp @@ -0,0 +1,30 @@ +/** @addtogroup expressions + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +namespace kfr +{ +} // namespace kfr diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp @@ -1,60 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/gamma.hpp" - -namespace kfr -{ - -/// @brief Returns the approximate gamma function of an argument -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> gamma(const T1& x) -{ - return intrinsics::gamma(x); -} - -/// @brief Creates expression that returns the approximate gamma function of an argument -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::gamma, E1> gamma(E1&& x) -{ - return { fn::gamma(), std::forward<E1>(x) }; -} - -/// @brief Returns the approximate factorial of an argument -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> factorial_approx(const T1& x) -{ - return intrinsics::factorial_approx(x); -} - -/// @brief Creates expression that returns the approximate factorial of an argument -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x) -{ - return { fn::factorial_approx(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp @@ -1,4 +1,4 @@ -/** @addtogroup expressions +/** @addtogroup generators * @{ */ /* @@ -25,14 +25,16 @@ */ #pragma once -#include "function.hpp" -#include "log_exp.hpp" -#include "select.hpp" -#include "sin_cos.hpp" -#include "vec.hpp" +#include "../math/log_exp.hpp" +#include "../math/select.hpp" +#include "../math/sin_cos.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { @@ -41,14 +43,15 @@ template <typename T, size_t width_, typename Class> struct generator : input_expression { constexpr static size_t width = width_; - using value_type = T; + using value_type = T; constexpr static bool is_incremental = true; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t, 
vec_t<U, N> t) const + friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t, + vec_shape<U, N> t) { - return generate(t); + return self.generate(t); } void resync(T start) const { ptr_cast<Class>(this)->sync(start); } @@ -70,7 +73,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N == width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const { const vec<T, N> result = value; call_next(); @@ -78,7 +81,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N < width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const { const vec<T, N> result = narrow<N>(value); shift(csize_t<N>()); @@ -86,7 +89,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N > width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N> x) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const { const auto lo = generate(low(x)); const auto hi = generate(high(x)); @@ -96,58 +99,64 @@ protected: mutable vec<T, width> value; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2)> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)> struct generator_linear : generator<T, width, generator_linear<T, width>> { - constexpr generator_linear(T start, T step) noexcept : step(step), vstep(step * width) + generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); } + + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { - this->resync(start); + this->value = start + enumerate<T, width>() * step; } - CMT_INLINE void sync(T start) const noexcept { this->value = start + enumerate<T, width>() * step; } - - CMT_INLINE void next() const noexcept { this->value += vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* 
bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp : generator<T, width, generator_exp<T, width>> { - generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step * width))[0] - 1) + generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1) { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = exp(start + enumerate<T, width>() * step); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT + { + this->value = exp(start + enumerate<T, width>() * step); + } - CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp2 : generator<T, width, generator_exp2<T, width>> { - generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step * width))[0] - 1) + generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1) { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = exp2(start + enumerate<T, width>() * step); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT + { + this->value = exp2(start + enumerate<T, width>() * step); + } - CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct 
generator_cossin : generator<T, width, generator_cossin<T, width>> { generator_cossin(T start, T step) @@ -155,9 +164,9 @@ struct generator_cossin : generator<T, width, generator_cossin<T, width>> { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = init_cossin(step, start); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { this->value = init_cossin(step, start); } - CMT_INLINE void next() const noexcept + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value = this->value - subadd(alpha * this->value, beta * swap<2>(this->value)); } @@ -172,7 +181,7 @@ protected: } }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(2, 4), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(2, 4), KFR_ARCH_DEP> struct generator_sin : generator<T, width, generator_sin<T, width>> { generator_sin(T start, T step) @@ -180,14 +189,14 @@ struct generator_sin : generator<T, width, generator_sin<T, width>> { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step))); this->cos_value = low(cs); this->value = high(cs); } - CMT_INLINE void next() const noexcept + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { const vec<T, width> c = this->cos_value; const vec<T, width> s = this->value; @@ -200,7 +209,7 @@ struct generator_sin : generator<T, width, generator_sin<T, width>> } template <size_t N> - void shift(csize_t<N>) const noexcept + void shift(csize_t<N>) const CMT_NOEXCEPT { const vec<T, width> oldvalue = this->value; const vec<T, width> oldcosvalue = this->cos_value; @@ -226,7 +235,7 @@ protected: \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step) +KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 
start, T2 step) { return internal::generator_linear<TF>(start, step); } @@ -238,7 +247,7 @@ KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step) +KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step) { return internal::generator_exp<TF>(start, step); } @@ -250,7 +259,7 @@ KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) +KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) { return internal::generator_exp2<TF>(start, step); } @@ -266,7 +275,7 @@ KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) +KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) { return internal::generator_cossin<TF>(start, step); } @@ -278,8 +287,9 @@ KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_sin<TF> gen_sin(T1 start, T2 step) +KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step) { return internal::generator_sin<TF>(start, step); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/horizontal.hpp b/include/kfr/base/horizontal.hpp @@ -1,119 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either 
version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "operators.hpp" - -namespace kfr -{ - -namespace internal -{ - -template <typename T, typename ReduceFn> -CMT_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) -{ - return T(value[0]); -} - -template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> -CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) -{ - return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce)); -} -template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> -CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) -{ - const T initial = reduce(initialvalue<T>()); - return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); -} -} // namespace internal - -template <typename T, size_t N, typename ReduceFn> -CMT_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce) -{ - return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce)); -} - -/// @brief Sum all elements of the vector -template <typename T, size_t N> -CMT_INLINE T hadd(const vec<T, N>& value) -{ - return horizontal(value, fn::add()); -} -KFR_FN(hadd) - -/// @brief Multiply all elements of the vector -template <typename T, size_t N> -CMT_INLINE 
T hmul(const vec<T, N>& value) -{ - return horizontal(value, fn::mul()); -} -KFR_FN(hmul) - -template <typename T, size_t N> -CMT_INLINE T hbitwiseand(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwiseand()); -} -KFR_FN(hbitwiseand) -template <typename T, size_t N> -CMT_INLINE T hbitwiseor(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwiseor()); -} -KFR_FN(hbitwiseor) -template <typename T, size_t N> -CMT_INLINE T hbitwisexor(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwisexor()); -} -KFR_FN(hbitwisexor) - -/// @brief Calculate the Dot-Product of two vectors -template <typename T, size_t N> -CMT_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y) -{ - return hadd(x * y); -} -KFR_FN(dot) - -/// @brief Calculate the Arithmetic mean of all elements in the vector -template <typename T, size_t N> -CMT_INLINE T avg(const vec<T, N>& value) -{ - return hadd(value) / N; -} -KFR_FN(avg) - -/// @brief Calculate the RMS of all elements in the vector -template <typename T, size_t N> -CMT_INLINE T rms(const vec<T, N>& value) -{ - return internal::builtin_sqrt(hadd(value * value) / N); -} -KFR_FN(rms) -} // namespace kfr diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp @@ -1,120 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/hyperbolic.hpp" - -namespace kfr -{ - -/// @brief Returns the hyperbolic sine of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinh(const T1& x) -{ - return intrinsics::sinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinh, E1> sinh(E1&& x) -{ - return { fn::sinh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cosh(const T1& x) -{ - return intrinsics::cosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cosh, E1> cosh(E1&& x) -{ - return { fn::cosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic tangent of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tanh(const T1& x) -{ - return intrinsics::tanh(x); -} - -/// @brief Returns template expression that returns the hyperbolic tangent of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tanh, E1> tanh(E1&& x) -{ - return { fn::tanh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cotangent of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> coth(const T1& x) -{ - return intrinsics::coth(x); -} - -/// @brief Returns template 
expression that returns the hyperbolic cotangent of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::coth, E1> coth(E1&& x) -{ - return { fn::coth(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd -/// elements of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinhcosh(const T1& x) -{ - return intrinsics::sinhcosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the -/// hyperbolic cosine of the odd elements of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x) -{ - return { fn::sinhcosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd -/// elements of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> coshsinh(const T1& x) -{ - return intrinsics::coshsinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and -/// the hyperbolic sine of the odd elements of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x) -{ - return { fn::coshsinh(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/impl/abs.hpp b/include/kfr/base/impl/abs.hpp @@ -1,126 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any 
later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS - -// floating point -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return x & constants<T>::invhighbitmask(); -} - -KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); } -KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); } -KFR_SINTRIN i8sse abs(const i8sse& x) { return _mm_abs_epi8(*x); } -KFR_SINTRIN u64sse abs(const u64sse& x) { return x; } -KFR_SINTRIN u32sse abs(const u32sse& x) { return x; } -KFR_SINTRIN u16sse abs(const u16sse& x) { return x; } -KFR_SINTRIN u8sse abs(const u8sse& x) { return x; } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); } -KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); } -KFR_SINTRIN i8avx abs(const i8avx& x) { return _mm256_abs_epi8(*x); } -KFR_SINTRIN u64avx abs(const u64avx& x) { return x; } -KFR_SINTRIN u32avx abs(const u32avx& x) { return x; } -KFR_SINTRIN u16avx abs(const 
u16avx& x) { return x; } -KFR_SINTRIN u8avx abs(const u8avx& x) { return x; } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN i64avx512 abs(const i64avx512& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32avx512 abs(const i32avx512& x) { return _mm512_abs_epi32(*x); } -KFR_SINTRIN i16avx512 abs(const i16avx512& x) { return _mm512_abs_epi16(*x); } -KFR_SINTRIN i8avx512 abs(const i8avx512& x) { return _mm512_abs_epi8(*x); } -KFR_SINTRIN u64avx512 abs(const u64avx512& x) { return x; } -KFR_SINTRIN u32avx512 abs(const u32avx512& x) { return x; } -KFR_SINTRIN u16avx512 abs(const u16avx512& x) { return x; } -KFR_SINTRIN u8avx512 abs(const u8avx512& x) { return x; } -#endif - -KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); } -KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); } -KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN i64neon abs(const i64neon& x) { return vabsq_s64(*x); } -#else -KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); } -#endif - -KFR_SINTRIN u8neon abs(const u8neon& x) { return x; } -KFR_SINTRIN u16neon abs(const u16neon& x) { return x; } -KFR_SINTRIN u32neon abs(const u32neon& x) { return x; } -KFR_SINTRIN u64neon abs(const u64neon& x) { return x; } - -KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); } -#else -KFR_SINTRIN f64neon abs(const f64neon& x) { return x & constants<f64>::invhighbitmask(); } -#endif - -KFR_HANDLE_ALL_SIZES_1(abs) - -#else - -// floating point -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return x & constants<T>::invhighbitmask(); -} - -// fallback -template <typename T, size_t N, 
KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return select(x >= T(0), x, -x); -} -#endif -KFR_I_CONVERTER(abs) -} // namespace intrinsics - -KFR_I_FN(abs) - -} // namespace kfr diff --git a/include/kfr/base/impl/asin_acos.hpp b/include/kfr/base/impl/asin_acos.hpp @@ -1,58 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../atan.hpp" -#include "../function.hpp" -#include "../select.hpp" -#include "../sqrt.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x) -{ - const vec<Tout, N> xx = x; - return atan2(xx, sqrt(Tout(1) - xx * xx)); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x) -{ - const vec<Tout, N> xx = x; - return -atan2(xx, sqrt(Tout(1) - xx * xx)) + constants<Tout>::pi * 0.5; -} -KFR_I_FLT_CONVERTER(asin) -KFR_I_FLT_CONVERTER(acos) -} // namespace intrinsics -KFR_I_FN(asin) -KFR_I_FN(acos) - -} // namespace kfr diff --git a/include/kfr/base/impl/atan.hpp b/include/kfr/base/impl/atan.hpp @@ -1,229 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" -#include "../sin_cos.hpp" - -namespace kfr -{ -namespace intrinsics -{ -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx) -{ - vec<f32, N> x = xx, y = yy; - vec<f32, N> s, t, u; - vec<i32, N> q; - q = select(x < 0, -2, 0); - x = select(x < 0, -x, x); - mask<i32, N> m; - m = y > x; - t = x; - x = select(m, y, x); - y = select(m, -t, y); - q = select(m, q + 1, q); - s = y / x; - t = s * s; - u = 0.00282363896258175373077393f; - u = fmadd(u, t, -0.0159569028764963150024414f); - u = fmadd(u, t, 0.0425049886107444763183594f); - u = fmadd(u, t, -0.0748900920152664184570312f); - u = fmadd(u, t, 0.106347933411598205566406f); - u = fmadd(u, t, -0.142027363181114196777344f); - u = fmadd(u, t, 0.199926957488059997558594f); - u = fmadd(u, t, -0.333331018686294555664062f); - t = u * t * s + s; - t = cast<f32>(q) * 1.5707963267948966192313216916398f + t; - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx) -{ - vec<f64, N> x = xx, y = yy; - vec<f64, N> s, t, u; - vec<i64, N> q; - q = select(x < 0, i64(-2), i64(0)); - x = select(x < 0, -x, x); - mask<i64, N> m; - m = y > x; - t = x; - x = select(m, y, x); - y = select(m, -t, y); - q = select(m, q + i64(1), q); - s = y / x; - t = s * s; - u = -1.88796008463073496563746e-05; - u = fmadd(u, t, 0.000209850076645816976906797); - u = fmadd(u, t, -0.00110611831486672482563471); - u = fmadd(u, t, 0.00370026744188713119232403); - u = fmadd(u, t, -0.00889896195887655491740809); - u = fmadd(u, t, 0.016599329773529201970117); - u = fmadd(u, t, -0.0254517624932312641616861); - u = fmadd(u, t, 0.0337852580001353069993897); - u = fmadd(u, t, -0.0407629191276836500001934); - u = fmadd(u, t, 0.0466667150077840625632675); - u = fmadd(u, t, -0.0523674852303482457616113); - u = fmadd(u, 
t, 0.0587666392926673580854313); - u = fmadd(u, t, -0.0666573579361080525984562); - u = fmadd(u, t, 0.0769219538311769618355029); - u = fmadd(u, t, -0.090908995008245008229153); - u = fmadd(u, t, 0.111111105648261418443745); - u = fmadd(u, t, -0.14285714266771329383765); - u = fmadd(u, t, 0.199999999996591265594148); - u = fmadd(u, t, -0.333333333333311110369124); - t = u * t * s + s; - t = cast<f64>(q) * 1.5707963267948966192313216916398 + t; - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) -{ - vec<f32, N> r = atan2k(abs(y), x); - constexpr f32 pi = 3.1415926535897932384626433832795f; - constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f; - constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f; - r = mulsign(r, x); - r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r); - r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r); - r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r); - r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); - return r; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) -{ - vec<f64, N> r = atan2k(abs(y), x); - constexpr f64 pi = 3.1415926535897932384626433832795; - constexpr f64 pi_over_2 = 1.5707963267948966192313216916398; - constexpr f64 pi_over_4 = 0.78539816339744830961566084581988; - r = mulsign(r, x); - r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); - r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); - r = select(y == 0.0, select(x < 0., pi, 0.), r); - r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); - return r; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan(const vec<f32, N>& x) -{ - vec<f32, N> t, u; - vec<i32, N> q; - q = select(x < 0.f, 2, 0); - vec<f32, N> s = select(x < 0.f, -x, x); - q = select(s > 1.f, q | 1, q); - s = 
select(s > 1.f, 1.0f / s, s); - t = s * s; - u = 0.00282363896258175373077393f; - u = fmadd(u, t, -0.0159569028764963150024414f); - u = fmadd(u, t, 0.0425049886107444763183594f); - u = fmadd(u, t, -0.0748900920152664184570312f); - u = fmadd(u, t, 0.106347933411598205566406f); - u = fmadd(u, t, -0.142027363181114196777344f); - u = fmadd(u, t, 0.199926957488059997558594f); - u = fmadd(u, t, -0.333331018686294555664062f); - t = s + s * (t * u); - t = select((q & 1) != 0, 1.570796326794896557998982f - t, t); - t = select((q & 2) != 0, -t, t); - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& x) -{ - vec<f64, N> t, u; - vec<i64, N> q; - q = select(x < 0.0, i64(2), i64(0)); - vec<f64, N> s = select(x < 0.0, -x, x); - q = select(s > 1.0, q | 1, q); - s = select(s > 1.0, 1.0 / s, s); - t = s * s; - u = -1.88796008463073496563746e-05; - u = fmadd(u, t, 0.000209850076645816976906797); - u = fmadd(u, t, -0.00110611831486672482563471); - u = fmadd(u, t, 0.00370026744188713119232403); - u = fmadd(u, t, -0.00889896195887655491740809); - u = fmadd(u, t, 0.016599329773529201970117); - u = fmadd(u, t, -0.0254517624932312641616861); - u = fmadd(u, t, 0.0337852580001353069993897); - u = fmadd(u, t, -0.0407629191276836500001934); - u = fmadd(u, t, 0.0466667150077840625632675); - u = fmadd(u, t, -0.0523674852303482457616113); - u = fmadd(u, t, 0.0587666392926673580854313); - u = fmadd(u, t, -0.0666573579361080525984562); - u = fmadd(u, t, 0.0769219538311769618355029); - u = fmadd(u, t, -0.090908995008245008229153); - u = fmadd(u, t, 0.111111105648261418443745); - u = fmadd(u, t, -0.14285714266771329383765); - u = fmadd(u, t, 0.199999999996591265594148); - u = fmadd(u, t, -0.333333333333311110369124); - t = s + s * (t * u); - t = select((q & 1) != 0, 1.570796326794896557998982 - t, t); - t = select((q & 2) != 0, -t, t); - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atandeg(const vec<f32, N>& x) -{ - return atan(x) * c_radtodeg<f32>; 
-} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atandeg(const vec<f64, N>& x) -{ - return atan(x) * c_radtodeg<f64>; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x) -{ - return atan2(y, x) * c_radtodeg<f32>; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x) -{ - return atan2(y, x) * c_radtodeg<f64>; -} - -KFR_I_FLT_CONVERTER(atan) -KFR_I_FLT_CONVERTER(atan2) -KFR_I_FLT_CONVERTER(atandeg) -KFR_I_FLT_CONVERTER(atan2deg) -} // namespace intrinsics -KFR_I_FN(atan) -KFR_I_FN(atandeg) -KFR_I_FN(atan2) -KFR_I_FN(atan2deg) -} // namespace kfr diff --git a/include/kfr/base/impl/clamp.hpp b/include/kfr/base/impl/clamp.hpp @@ -1,56 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../min_max.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T> -KFR_SINTRIN T clamp(const T& x, const T& lo, const T& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) -{ - return max(min(x, hi), zerovector<T, N>()); -} -} // namespace intrinsics -KFR_I_FN(clamp) - -} // namespace kfr diff --git a/include/kfr/base/impl/gamma.hpp b/include/kfr/base/impl/gamma.hpp @@ -1,72 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "../function.hpp" -#include "../log_exp.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ -template <typename T> -constexpr T gamma_precalc[] = { - 0x2.81b263fec4e08p+0, 0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16, - -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16, -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12, - -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0, -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24, -}; - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z) -{ - constexpr size_t Count = arraysize(gamma_precalc<T>); - vec<T, N> accm = gamma_precalc<T>[0]; - CMT_LOOP_UNROLL - for (size_t k = 1; k < Count; k++) - accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k)); - accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); - return accm / z; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> factorial_approx(const vec<T, N>& x) -{ - return gamma(x + T(1)); -} -KFR_I_FLT_CONVERTER(gamma) -KFR_I_FLT_CONVERTER(factorial_approx) -} // namespace intrinsics -KFR_I_FN(gamma) -KFR_I_FN(factorial_approx) - -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/impl/hyperbolic.hpp b/include/kfr/base/impl/hyperbolic.hpp @@ -1,100 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../log_exp.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinh(const vec<T, N>& x) -{ - const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); - return (exp(xx) - exp(-xx)) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cosh(const vec<T, N>& x) -{ - const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); - return (exp(xx) + exp(-xx)) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> tanh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(2 * x); - return (a - 1) / (a + 1); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> coth(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(2 * x); - return (a + 1) / (a - 1); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinhcosh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(x); - const vec<Tout, N> b = exp(-x); - return subadd(a, b) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> coshsinh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(x); - const vec<Tout, N> b = exp(-x); - return addsub(a, b) * Tout(0.5); -} - -KFR_I_FLT_CONVERTER(sinh) 
-KFR_I_FLT_CONVERTER(cosh) -KFR_I_FLT_CONVERTER(tanh) -KFR_I_FLT_CONVERTER(coth) -KFR_I_FLT_CONVERTER(sinhcosh) -KFR_I_FLT_CONVERTER(coshsinh) -} // namespace intrinsics -KFR_I_FN(sinh) -KFR_I_FN(cosh) -KFR_I_FN(tanh) -KFR_I_FN(coth) -KFR_I_FN(sinhcosh) -KFR_I_FN(coshsinh) - -} // namespace kfr diff --git a/include/kfr/base/impl/log_exp.hpp b/include/kfr/base/impl/log_exp.hpp @@ -1,315 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../clamp.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../round.hpp" -#include "../select.hpp" -#include "../shuffle.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <size_t N> -KFR_SINTRIN vec<i32, N> vilogbp1(const vec<f32, N>& d) -{ - mask<i32, N> m = d < 5.421010862427522E-20f; - vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff; - q = select(m, q - (64 + 0x7e), q - 0x7e); - return q; -} - -template <size_t N> -KFR_SINTRIN vec<i64, N> vilogbp1(const vec<f64, N>& d) -{ - mask<i64, N> m = d < 4.9090934652977266E-91; - vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff; - q = select(m, q - (300 + 0x03fe), q - 0x03fe); - return q; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q) -{ - vec<i32, N> m = q >> 31; - m = (((m + q) >> 6) - m) << 4; - const vec<i32, N> qq = q - (m << 2); - m = clamp(m + 0x7f, vec<i32, N>(0xff)); - vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23)); - return x * u * bitcast<f32>((cast<i32>(qq + 0x7f)) << 23); -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q) -{ - vec<i64, N> m = q >> 31; - m = (((m + q) >> 9) - m) << 7; - const vec<i64, N> qq = q - (m << 2); - m = clamp(m + 0x3ff, i64(0x7ff)); - vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52)); - return x * u * bitcast<f64>((cast<i64>(qq + 0x3ff)) << 52); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x) -{ - return select(x == T(), -c_infinity<T>, static_cast<vec<T, N>>(vilogbp1(x) - 1)); -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d) -{ - vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f ); - vec<f32, N> m = vldexpk(d, -e); - - vec<f32, N> x = (m - 1.0f) / (m + 1.0f); - vec<f32, 
N> x2 = x * x; - - vec<f32, N> sp = select(d < 0, constants<f32>::qnan, constants<f32>::neginfinity); - - vec<f32, N> t = 0.2371599674224853515625f; - t = fmadd(t, x2, 0.285279005765914916992188f); - t = fmadd(t, x2, 0.400005519390106201171875f); - t = fmadd(t, x2, 0.666666567325592041015625f); - t = fmadd(t, x2, 2.0f); - - x = x * t + c_log_2<f32> * cast<f32>(e); - x = select(d > 0, x, sp); - - return x; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) -{ - vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 ); - vec<f64, N> m = vldexpk(d, -e); - - vec<f64, N> x = (m - 1.0) / (m + 1.0); - vec<f64, N> x2 = x * x; - - vec<f64, N> sp = select(d < 0, constants<f64>::qnan, constants<f64>::neginfinity); - - vec<f64, N> t = 0.148197055177935105296783; - t = fmadd(t, x2, 0.153108178020442575739679); - t = fmadd(t, x2, 0.181837339521549679055568); - t = fmadd(t, x2, 0.22222194152736701733275); - t = fmadd(t, x2, 0.285714288030134544449368); - t = fmadd(t, x2, 0.399999999989941956712869); - t = fmadd(t, x2, 0.666666666666685503450651); - t = fmadd(t, x2, 2); - - x = x * t + constants<f64>::log_2 * cast<f64>(e); - x = select(d > 0, x, sp); - - return x; -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x) -{ - return log(cast<Tout>(x)) * constants<Tout>::recip_log_2; -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x) -{ - return log(cast<Tout>(x)) * constants<Tout>::recip_log_10; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d) -{ - const f32 ln2_part1 = 0.6931457519f; - const f32 ln2_part2 = 1.4286067653e-6f; - - vec<i32, N> q = cast<i32>(floor(d * constants<f32>::recip_log_2)); - vec<f32, N> s, u; - - s = fmadd(cast<f32>(q), -ln2_part1, d); - s = fmadd(cast<f32>(q), -ln2_part2, s); - - const f32 c2 = 0.4999999105930328369140625f; - const f32 c3 = 0.166668415069580078125f; 
- const f32 c4 = 4.16539050638675689697265625e-2f; - const f32 c5 = 8.378830738365650177001953125e-3f; - const f32 c6 = 1.304379315115511417388916015625e-3f; - const f32 c7 = 2.7555381529964506626129150390625e-4f; - - u = c7; - u = fmadd(u, s, c6); - u = fmadd(u, s, c5); - u = fmadd(u, s, c4); - u = fmadd(u, s, c3); - u = fmadd(u, s, c2); - - u = s * s * u + s + 1.0f; - u = vldexpk(u, q); - - u = select(d == constants<f32>::neginfinity, 0.f, u); - - return u; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d) -{ - const f64 ln2_part1 = 0.69314717501401901245; - const f64 ln2_part2 = 5.545926273775592108e-009; - - vec<i64, N> q = cast<i64>(floor(d * +constants<f64>::recip_log_2)); - vec<f64, N> s, u; - - s = fmadd(cast<f64>(q), -ln2_part1, d); - s = fmadd(cast<f64>(q), -ln2_part2, s); - - const f64 c2 = 0.499999999999994948485237955537741072475910186767578; - const f64 c3 = 0.166666666667024204739888659787538927048444747924805; - const f64 c4 = 4.16666666578945840693215529881854308769106864929199e-2; - const f64 c5 = 8.3333334397461874404333670440792047884315252304077e-3; - const f64 c6 = 1.3888881489747750223179290074426717183087021112442e-3; - const f64 c7 = 1.9841587032493949419205414574918222569976933300495e-4; - const f64 c8 = 2.47929324077393282239802768662784160369483288377523e-5; - const f64 c9 = 2.77076037925831049422552981864598109496000688523054e-6; - const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7; - const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8; - - u = c11; - u = fmadd(u, s, c10); - u = fmadd(u, s, c9); - u = fmadd(u, s, c8); - u = fmadd(u, s, c7); - u = fmadd(u, s, c6); - u = fmadd(u, s, c5); - u = fmadd(u, s, c4); - u = fmadd(u, s, c3); - u = fmadd(u, s, c2); - - u = s * s * u + s + 1.0; - u = vldexpk(u, q); - - u = select(d == constants<f64>::neginfinity, 0.0, u); - - return u; -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> 
exp2(const vec<T, N>& x) -{ - return exp(x * constants<Tout>::log_2); -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x) -{ - return exp(x * constants<Tout>::log_10); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = exp(b * log(abs(a))); - const mask<T, N> isint = floor(b) == b; - const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0; - return select( - a > T(), t, - select(a == T(), T(), select(isint, select(iseven, t, -t), broadcast<N>(constants<T>::qnan)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b) -{ - return exp(reciprocal(b) * log(x)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cbrt(const vec<T, N>& x) -{ - return pow<T, N>(x, T(0.333333333333333333333333333333333)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cbrt(const vec<T, N>& x) -{ - return cbrt(cast<Tout>(x)); -} - -KFR_I_FLT_CONVERTER(exp) -KFR_I_FLT_CONVERTER(exp2) -KFR_I_FLT_CONVERTER(exp10) -KFR_I_FLT_CONVERTER(log) -KFR_I_FLT_CONVERTER(log2) -KFR_I_FLT_CONVERTER(log10) -KFR_I_FLT_CONVERTER(logb) -KFR_I_FLT_CONVERTER(pow) -KFR_I_FLT_CONVERTER(root) -KFR_I_FLT_CONVERTER(cbrt) - -template <typename T1, typename T2> -KFR_SINTRIN flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b) -{ - return log(a) / log(b); -} - -template <typename T1, typename T2> -KFR_SINTRIN flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b) -{ - return log(a) * b; -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a) -{ - return exp(fmadd(x, m, a)); -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a) -{ - 
return fmadd(log(x), m, a); -} -} // namespace intrinsics -KFR_I_FN(exp) -KFR_I_FN(exp2) -KFR_I_FN(exp10) -KFR_I_FN(log) -KFR_I_FN(log2) -KFR_I_FN(log10) -KFR_I_FN(logb) -KFR_I_FN(logn) -KFR_I_FN(logm) -KFR_I_FN(exp_fmadd) -KFR_I_FN(log_fmadd) -KFR_I_FN(pow) -KFR_I_FN(root) -KFR_I_FN(cbrt) - -} // namespace kfr diff --git a/include/kfr/base/impl/logical.hpp b/include/kfr/base/impl/logical.hpp @@ -1,289 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../function.hpp" -#include "../operators.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <size_t bits> -struct bitmask -{ - using type = conditional<(bits > 32), uint64_t, - conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>; - - bitmask(type val) : value(val) {} - - template <typename Itype> - bitmask(Itype val) : value(static_cast<type>(val)) - { - } - - type value; -}; - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -#if defined CMT_ARCH_SSE41 - -// horizontal OR -KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u32sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u64sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i8sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i16sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i32sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i64sse& x) { return !_mm_testz_si128(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const u8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i64sse& x) { 
return _mm_testc_si128(*x, *allonesvector(x)); } -#endif - -#if defined CMT_ARCH_AVX -// horizontal OR -KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); } - -KFR_SINTRIN bool bittestany(const f32avx& x) { return !_mm256_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(const f64avx& x) { return !_mm256_testz_pd(*x, *x); } - -KFR_SINTRIN bool bittestany(const u8avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u16avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u32avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u64avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i8avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i16avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i32avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i64avx& x) { return !_mm256_testz_si256(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const f64sse& x) { return _mm_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestall(const f32avx& x) { return _mm256_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const f64avx& x) { return _mm256_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestall(const u8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i8avx& x) { return 
_mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } - -#if defined CMT_ARCH_AVX512 -// horizontal OR -KFR_SINTRIN bool bittestany(const f32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const f64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const f32avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const f64avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u8avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u16avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u32avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u64avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i8avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i16avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i32avx512& x) { 
return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i64avx512& x) { return !bittestany(~x); } - -#endif - -#elif defined CMT_ARCH_SSE41 -KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestall(const f32sse& x) -{ - return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); -} -KFR_SINTRIN bool bittestall(const f64sse& x) -{ - return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); -} -#endif - -#if !defined CMT_ARCH_SSE41 - -KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); } -KFR_SINTRIN bool bittestany(const u8sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u16sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u32sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u64sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i8sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i16sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i32sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i64sse& x) { return _mm_movemask_epi8(*x); } - -KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); } -KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); } -KFR_SINTRIN bool bittestall(const u8sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u16sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u32sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u64sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i8sse& x) { return 
!_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i16sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); } -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(expand_simd(a, internal::maskbits<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(expand_simd(a, internal::maskbits<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN bool bittestall(const u32neon& a) -{ - const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a)); - return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; -} - -KFR_SINTRIN bool bittestany(const u32neon& a) -{ - const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a)); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; -} -KFR_SINTRIN bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i64neon& a) 
{ return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } - -KFR_SINTRIN bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(expand_simd(a, internal::maskbits<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(expand_simd(a, internal::maskbits<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#else - -template <typename T, size_t N> -KFR_SINTRIN bitmask<N> getmask(const vec<T, N>& x) -{ - typename bitmask<N>::type val = 0; - for (size_t i = 0; i < N; i++) - { - val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; - } - return val; -} - -template <typename 
T, size_t N> -KFR_SINTRIN bool bittestany(const vec<T, N>& x) -{ - return getmask(x).value; -} -template <typename T, size_t N> -KFR_SINTRIN bool bittestany(const vec<T, N>& x, const vec<T, N>& y) -{ - return bittestany(x & y); -} - -template <typename T, size_t N> -KFR_SINTRIN bool bittestall(const vec<T, N>& x) -{ - return !getmask(~x).value; -} -template <typename T, size_t N> -KFR_SINTRIN bool bittestall(const vec<T, N>& x, const vec<T, N>& y) -{ - return !bittestany(~x & y); -} -#endif -} // namespace intrinsics - -} // namespace kfr diff --git a/include/kfr/base/impl/min_max.hpp b/include/kfr/base/impl/min_max.hpp @@ -1,232 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); } -KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); } -KFR_SINTRIN u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(*x, *y); } -KFR_SINTRIN i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(*x, *y); } - -KFR_SINTRIN f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(*x, *y); } -KFR_SINTRIN f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(*x, *y); } -KFR_SINTRIN u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(*x, *y); } -KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(*x, *y); } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); } -KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); } -KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); } -KFR_SINTRIN u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(*x, *y); } -KFR_SINTRIN i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(*x, *y); } -KFR_SINTRIN u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(*x, *y); } - -KFR_SINTRIN u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(*x, *y); } -KFR_SINTRIN i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(*x, *y); } -KFR_SINTRIN i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(*x, *y); } -KFR_SINTRIN u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(*x, *y); } -KFR_SINTRIN i32avx max(const i32avx& x, const i32avx& y) { return 
_mm256_max_epi32(*x, *y); } -KFR_SINTRIN u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(*x, *y); } - -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(*x, *y); } -KFR_SINTRIN i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(*x, *y); } -KFR_SINTRIN i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(*x, *y); } -KFR_SINTRIN u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(*x, *y); } -KFR_SINTRIN i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(*x, *y); } -KFR_SINTRIN u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(*x, *y); } -KFR_SINTRIN u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(*x, *y); } -KFR_SINTRIN i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(*x, *y); } -KFR_SINTRIN i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(*x, *y); } -KFR_SINTRIN u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(*x, *y); } -KFR_SINTRIN i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(*x, *y); } -KFR_SINTRIN u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(*x, *y); } -KFR_SINTRIN i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(*x, *y); } -KFR_SINTRIN u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(*x, *y); } -KFR_SINTRIN i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(*x, *y); } -KFR_SINTRIN u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(*x, *y); } - -KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(*x, *y); } -KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(*x, 
*y); } -KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(*x, *y); } -KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(*x, *y); } - -KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(*x, *y); } -KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(*x, *y); } -KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(*x, *y); } -KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(*x, *y); } -#else -KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } -KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } -KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } -KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } -KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } -KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } -#endif - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); } -KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); } -KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); } -KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); } -#endif - -#if defined CMT_ARCH_SSE41 -KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); } -KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); } -KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return 
_mm_min_epi32(*x, *y); } -KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(*x, *y); } - -KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(*x, *y); } -KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(*x, *y); } -KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(*x, *y); } -KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(*x, *y); } -#else -KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } -KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } - -KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } -KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } - -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); } -KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); } -KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); } -KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); } -KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); } -KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); } -KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } -KFR_SINTRIN u64neon 
min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } - -KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); } -KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); } -KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); } -KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); } -KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); } -KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); } -KFR_SINTRIN i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } -KFR_SINTRIN u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } - -KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); } -KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); } -KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); } -#else -KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } -KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#else - -// fallback -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x < y, x, y); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x > y, x, y); -} -#endif - -template <typename T> -KFR_SINTRIN T min(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_SINTRIN T max(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::min(); -} -template <typename T> -KFR_SINTRIN T absmin(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_SINTRIN T absmax(initialvalue<T>) -{ - return 0; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) -{ - return min(abs(x), abs(y)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) -{ - return max(abs(x), abs(y)); -} - -KFR_I_CONVERTER(min) -KFR_I_CONVERTER(max) -KFR_I_CONVERTER(absmin) -KFR_I_CONVERTER(absmax) -} // namespace intrinsics -KFR_I_FN(min) -KFR_I_FN(max) -KFR_I_FN(absmin) -KFR_I_FN(absmax) - -} // namespace kfr diff --git a/include/kfr/base/impl/modzerobessel.hpp b/include/kfr/base/impl/modzerobessel.hpp @@ -1,105 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" -#include "../log_exp.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N> -CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) -{ - constexpr static T bessel_coef[] = { T(0.25), - T(0.027777777777777776236), - T(0.0017361111111111110147), - T(6.9444444444444444384e-005), - T(1.9290123456790123911e-006), - T(3.9367598891408417495e-008), - T(6.1511873267825652335e-010), - T(7.5940584281266239246e-012), - T(7.5940584281266233693e-014), - T(6.2760813455591932909e-016), - T(4.3583898233049949985e-018), - T(2.5789288895295827557e-020), - T(1.3157800456783586208e-022), - T(5.8479113141260384983e-025), - T(2.2843403570804837884e-027), - T(7.904291893012054025e-030), - T(2.4395962632753252792e-032), - T(6.75788438580422547e-035), - T(1.689471096451056426e-037), - T(3.8310002187098784929e-040), - T(7.9152897080782616517e-043), - T(1.4962740468957016443e-045), - T(2.5976979980828152196e-048), - T(4.1563167969325041577e-051), - T(6.1483976285983795968e-054), - T(8.434015951438105991e-057), - T(1.0757673407446563809e-059), - T(1.2791526049282476926e-062), - T(1.4212806721424974034e-065), - T(1.4789601166935457918e-068), - T(1.4442969889585408123e-071), - T(1.3262598613026086927e-074), - T(1.1472836170437790782e-077), - T(9.3655805472961564331e-081), - T(7.2265282000741942594e-084), - T(5.2786911614858977913e-087), - T(3.6556032974279072401e-090), - T(2.4034209713529963119e-093), - T(1.5021381070956226783e-096) }; - - const vec<T, N> x_2 = x * 0.5; - const vec<T, N> x_2_sqr = x_2 * x_2; - vec<T, N> num = x_2_sqr; - vec<T, N> result; - result = 1 + x_2_sqr; - - 
CMT_LOOP_UNROLL - for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++) - { - result = fmadd((num *= x_2_sqr), bessel_coef[i], result); - } - return result; -} - -KFR_I_CONVERTER(modzerobessel) -} // namespace intrinsics -KFR_I_FN(modzerobessel) - -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/impl/round.hpp b/include/kfr/base/impl/round.hpp @@ -1,255 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../function.hpp" -#include "../operators.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ss(V) \ - _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_sd(V) \ - _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) -#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) - -#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm512_trunc_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm512_roundnearest_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm512_trunc_pd(V) _mm512_roundscale_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm512_roundnearest_pd(V) _mm512_roundscale_pd((V), 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); } -KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); } -KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); } -KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); } -KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); } -KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); } -KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); } -KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); } -KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); } -KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); } - -#if defined CMT_ARCH_AVX - -KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); } -KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); } -KFR_SINTRIN f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(*value); } -KFR_SINTRIN f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(*value); } -KFR_SINTRIN f64avx floor(const f64avx& value) { return _mm256_floor_pd(*value); } -KFR_SINTRIN f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(*value); } -KFR_SINTRIN f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(*value); } -KFR_SINTRIN f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(*value); } -KFR_SINTRIN f32avx fract(const f32avx& x) { return x - floor(x); } -KFR_SINTRIN f64avx fract(const f64avx& x) { return x - floor(x); } -#endif - -#if defined CMT_ARCH_AVX512 - -KFR_SINTRIN f32avx512 floor(const f32avx512& value) { return _mm512_floor_ps(*value); } -KFR_SINTRIN f32avx512 ceil(const f32avx512& value) { return _mm512_ceil_ps(*value); } -KFR_SINTRIN 
f32avx512 trunc(const f32avx512& value) { return KFR_mm512_trunc_ps(*value); } -KFR_SINTRIN f32avx512 round(const f32avx512& value) { return KFR_mm512_roundnearest_ps(*value); } -KFR_SINTRIN f64avx512 floor(const f64avx512& value) { return _mm512_floor_pd(*value); } -KFR_SINTRIN f64avx512 ceil(const f64avx512& value) { return _mm512_ceil_pd(*value); } -KFR_SINTRIN f64avx512 trunc(const f64avx512& value) { return KFR_mm512_trunc_pd(*value); } -KFR_SINTRIN f64avx512 round(const f64avx512& value) { return KFR_mm512_roundnearest_pd(*value); } -KFR_SINTRIN f32avx512 fract(const f32avx512& x) { return x - floor(x); } -KFR_SINTRIN f64avx512 fract(const f64avx512& x) { return x - floor(x); } -#endif - -KFR_HANDLE_ALL_SIZES_F_1(floor) -KFR_HANDLE_ALL_SIZES_F_1(ceil) -KFR_HANDLE_ALL_SIZES_F_1(round) -KFR_HANDLE_ALL_SIZES_F_1(trunc) -KFR_HANDLE_ALL_SIZES_F_1(fract) - -#else - -// fallback - -template <size_t N> -KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x) -{ - vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t - select(x < t, 1.f, 0.f); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x) -{ - vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t - select(x < t, 1., 0.); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x) -{ - vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t + select(x > t, 1.f, 0.f); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x) -{ - vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t + select(x > t, 1., 0.); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x) -{ - return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x))); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> round(const vec<f64, N>& x) -{ - return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x))); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> trunc(const vec<f32, N>& x) -{ - return cast<f32>(cast<i32>(x)); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> trunc(const 
vec<f64, N>& x) -{ - return cast<f64>(cast<i64>(x)); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> fract(const vec<f32, N>& x) -{ - return x - floor(x); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> fract(const vec<f64, N>& x) -{ - return x - floor(x); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> floor(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> ceil(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> trunc(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> round(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fract(const vec<T, N>&) -{ - return T(0); -} - -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> ifloor(const vec<T, N>& value) -{ - return cast<IT>(floor(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> iceil(const vec<T, N>& value) -{ - return cast<IT>(ceil(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> itrunc(const vec<T, N>& value) -{ - return cast<IT>(trunc(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> iround(const vec<T, N>& value) -{ - return cast<IT>(round(value)); -} - -KFR_I_CONVERTER(floor) -KFR_I_CONVERTER(ceil) -KFR_I_CONVERTER(round) -KFR_I_CONVERTER(trunc) -KFR_I_CONVERTER(fract) -KFR_I_CONVERTER(ifloor) -KFR_I_CONVERTER(iceil) -KFR_I_CONVERTER(iround) -KFR_I_CONVERTER(itrunc) -} // namespace intrinsics -KFR_I_FN(floor) -KFR_I_FN(ceil) -KFR_I_FN(round) -KFR_I_FN(trunc) -KFR_I_FN(fract) -KFR_I_FN(ifloor) -KFR_I_FN(iceil) -KFR_I_FN(iround) -KFR_I_FN(itrunc) 
- -} // namespace kfr - -#undef KFR_mm_trunc_ps -#undef KFR_mm_roundnearest_ps -#undef KFR_mm_trunc_pd -#undef KFR_mm_roundnearest_pd -#undef KFR_mm_trunc_ss -#undef KFR_mm_roundnearest_ss -#undef KFR_mm_trunc_sd -#undef KFR_mm_roundnearest_sd -#undef KFR_mm_floor_ss -#undef KFR_mm_floor_sd -#undef KFR_mm_ceil_ss -#undef KFR_mm_ceil_sd -#undef KFR_mm256_trunc_ps -#undef KFR_mm256_roundnearest_ps -#undef KFR_mm256_trunc_pd -#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/base/impl/saturation.hpp b/include/kfr/base/impl/saturation.hpp @@ -1,192 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../function.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -// Generic functions -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> sum = aa + bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= 0, a, bitcast<T>(sum)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> diff = aa - bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < 0, a, bitcast<T>(diff)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = allonesvector(a); - return select(a > t - b, t, a + b); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - return select(a < b, zerovector(a), a - b); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); } -KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); } -KFR_SINTRIN u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(*x, *y); } -KFR_SINTRIN i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(*x, *y); } - -KFR_SINTRIN u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(*x, *y); } -KFR_SINTRIN i8sse satsub(const i8sse& x, 
const i8sse& y) { return _mm_subs_epi8(*x, *y); } -KFR_SINTRIN u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(*x, *y); } -KFR_SINTRIN i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(*x, *y); } - -KFR_SINTRIN i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } - -KFR_SINTRIN i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); } -KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); } -KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); } -KFR_SINTRIN i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(*x, *y); } - -KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(*x, *y); } -KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); } -KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); } -KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); } - -KFR_SINTRIN i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64avx satadd(const 
i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } - -KFR_SINTRIN i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(*x, *y); } -KFR_SINTRIN i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(*x, *y); } -KFR_SINTRIN u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(*x, *y); } -KFR_SINTRIN i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(*x, *y); } -KFR_SINTRIN u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(*x, *y); } -KFR_SINTRIN i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(*x, *y); } -KFR_SINTRIN u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(*x, *y); } -KFR_SINTRIN i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(*x, *y); } - -KFR_SINTRIN i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32avx512 satadd(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64avx512 satadd(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_add(a, 
b); } -KFR_SINTRIN i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32avx512 satsub(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64avx512 satsub(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_sub(a, b); } -#endif - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); } -KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); } -KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); } -KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); } -KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); } -KFR_SINTRIN i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); } -KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); } -KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); } - -KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); } -KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); } -KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); } -KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); } -KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(*a, *b); } -KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(*a, *b); } -KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(*a, *b); } 
-KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(*a, *b); } - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#else -// fallback -template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> -KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> -KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> -KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_sub(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> -KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif -KFR_I_CONVERTER(satadd) -KFR_I_CONVERTER(satsub) -} // namespace intrinsics -KFR_I_FN(satadd) -KFR_I_FN(satsub) -} // namespace kfr diff --git a/include/kfr/base/impl/select.hpp b/include/kfr/base/impl/select.hpp @@ -1,261 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" - -namespace kfr -{ -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8sse select(const maskfor<u8sse>& m, const u8sse& x, const u8sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u16sse select(const maskfor<u16sse>& m, const u16sse& x, const u16sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u32sse select(const maskfor<u32sse>& m, const u32sse& x, const u32sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u64sse select(const maskfor<u64sse>& m, const u64sse& x, const u64sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i8sse select(const maskfor<i8sse>& m, const i8sse& x, const i8sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i16sse select(const maskfor<i16sse>& m, const i16sse& x, const i16sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i32sse select(const maskfor<i32sse>& m, const i32sse& x, const i32sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i64sse select(const maskfor<i64sse>& m, const i64sse& x, const i64sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN f32sse select(const maskfor<f32sse>& m, const f32sse& x, const f32sse& y) -{ - return _mm_blendv_ps(*y, *x, *m); -} -KFR_SINTRIN f64sse select(const maskfor<f64sse>& m, const f64sse& x, const f64sse& y) -{ - return _mm_blendv_pd(*y, *x, *m); -} - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f64avx select(const maskfor<f64avx>& m, const f64avx& x, const f64avx& y) -{ - return _mm256_blendv_pd(*y, *x, *m); -} -KFR_SINTRIN f32avx select(const maskfor<f32avx>& m, const f32avx& x, const f32avx& y) -{ - return _mm256_blendv_ps(*y, *x, *m); -} -#endif - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx 
select(const maskfor<u8avx>& m, const u8avx& x, const u8avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u16avx select(const maskfor<u16avx>& m, const u16avx& x, const u16avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u32avx select(const maskfor<u32avx>& m, const u32avx& x, const u32avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u64avx select(const maskfor<u64avx>& m, const u64avx& x, const u64avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i8avx select(const maskfor<i8avx>& m, const i8avx& x, const i8avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i16avx select(const maskfor<i16avx>& m, const i16avx& x, const i16avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i32avx select(const maskfor<i32avx>& m, const i32avx& x, const i32avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i64avx select(const maskfor<i64avx>& m, const i64avx& x, const i64avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN f64avx512 select(const maskfor<f64avx512>& m, const f64avx512& x, const f64avx512& y) -{ - return _mm512_mask_blend_pd(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -KFR_SINTRIN f32avx512 select(const maskfor<f32avx512>& m, const f32avx512& x, const f32avx512& y) -{ - return _mm512_mask_blend_ps(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u8avx512 select(const maskfor<u8avx512>& m, const u8avx512& x, const u8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u16avx512 select(const maskfor<u16avx512>& m, const u16avx512& x, const u16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u32avx512 select(const maskfor<u32avx512>& m, const u32avx512& x, const u32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u64avx512 
select(const maskfor<u64avx512>& m, const u64avx512& x, const u64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const i8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i16avx512 select(const maskfor<i16avx512>& m, const i16avx512& x, const i16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i32avx512 select(const maskfor<i32avx512>& m, const i32avx512& x, const i32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, const i64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a.asvec()).asmask(), low(b), low(c)), - select(high(a.asvec()).asmask(), high(b), high(c))); -} - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32neon select(const maskfor<f32neon>& m, const f32neon& x, const f32neon& y) -{ - return vbslq_f32(*m, *x, *y); -} - -KFR_SINTRIN i8neon select(const maskfor<i8neon>& m, const i8neon& x, const i8neon& y) -{ - return vbslq_s8(*m, *x, *y); -} -KFR_SINTRIN u8neon select(const maskfor<u8neon>& m, const u8neon& x, const u8neon& y) -{ - return vbslq_u8(*m, *x, *y); -} -KFR_SINTRIN i16neon select(const maskfor<i16neon>& m, const i16neon& x, const i16neon& y) -{ 
- return vbslq_s16(*m, *x, *y); -} -KFR_SINTRIN u16neon select(const maskfor<u16neon>& m, const u16neon& x, const u16neon& y) -{ - return vbslq_u16(*m, *x, *y); -} -KFR_SINTRIN i32neon select(const maskfor<i32neon>& m, const i32neon& x, const i32neon& y) -{ - return vbslq_s32(*m, *x, *y); -} -KFR_SINTRIN u32neon select(const maskfor<u32neon>& m, const u32neon& x, const u32neon& y) -{ - return vbslq_u32(*m, *x, *y); -} -KFR_SINTRIN i64neon select(const maskfor<i64neon>& m, const i64neon& x, const i64neon& y) -{ - return vbslq_s64(*m, *x, *y); -} -KFR_SINTRIN u64neon select(const maskfor<u64neon>& m, const u64neon& x, const u64neon& y) -{ - return vbslq_u64(*m, *x, *y); -} - -#ifdef CMT_ARCH_NEON64 -KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y) -{ - return vbslq_f64(*m, *x, *y); -} -#else -KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a.asvec()).asmask(), low(b), low(c)), - select(high(a.asvec()).asmask(), high(b), high(c))); -} - -#else - -// fallback -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return select(m.asmask(), x, y); -} -} // namespace intrinsics -KFR_I_FN(select) - -} // namespace 
kfr diff --git a/include/kfr/base/impl/sin_cos.hpp b/include/kfr/base/impl/sin_cos.hpp @@ -1,338 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../round.hpp" -#include "../select.hpp" -#include "../shuffle.hpp" - -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0) -{ - return select(msk, a0, b0); -} - -template <typename T, size_t N, typename... Ts> -KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0, - const T& a1, const T& b1, const Ts&... 
values) -{ - return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0)); -} - -template <typename T, size_t N, typename Tprecise = f64> -KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) -{ - const vec<T, N> xabs = abs(x); - constexpr T div = constants<T>::fold_constant_div; - vec<T, N> y = floor(xabs / div); - quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0)); - - const mask<T, N> msk = (quadrant & 1) != 0; - quadrant = kfr::select(msk, quadrant + 1, quadrant); - y = select(msk, y + T(1.0), y); - quadrant = quadrant & 7; - - constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi); - constexpr T rem1 = constants<T>::fold_constant_rem1; - constexpr T rem2 = constants<T>::fold_constant_rem2; - return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask) -{ - constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f); - constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f); - constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f); - constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f); - constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f); - constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f); - constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f); - constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f); - constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f); - constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f); - - const vec<f32, N> x2 = folded * folded; - - vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, - cos_c8, sin_c8, cos_c10, sin_c10); - - formula = select(cosmask, formula, formula * folded); - return formula; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> 
trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask) -{ - constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01); - constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03); - constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04); - constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06); - constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08); - constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10); - constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13); - constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14); - constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01); - constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02); - constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03); - constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05); - constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07); - constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09); - constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11); - constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14); - - vec<f64, N> x2 = folded * folded; - vec<f64, N> formula = - trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8, - cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16); - - formula = select(cosmask, formula, formula * folded); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> -KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x_full, quadrant); - - mask<T, N> flip_sign = - kfr::select(cosmask, ((quadrant == 2) 
|| (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask(); - - mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); - usecos = usecos ^ cosmask; - - vec<T, N> formula = trig_sincos(folded, usecos); - - mask<T, N> negmask = x_full < 0; - - flip_sign = flip_sign ^ (negmask & ~cosmask); - - formula = select(flip_sign, -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x, quadrant); - - mask<T, N> flip_sign = quadrant >= 4; - mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); - - vec<T, N> formula = trig_sincos(folded, usecos); - - formula = select(flip_sign ^ mask<T, N>(x), -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x, quadrant); - - mask<T, N> eq4 = (quadrant == 4); - mask<T, N> flip_sign = (quadrant == 2) || eq4; - mask<T, N> usecos = (quadrant == 0) || eq4; - - vec<T, N> formula = trig_sincos(folded, usecos); - - formula = select(flip_sign, -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x) -{ - const vec<T, N> msk = broadcast<N>(constants<T>::highbitmask()); - - constexpr static T c2 = -0.16665853559970855712890625; - constexpr static T c4 = +8.31427983939647674560546875e-3; - constexpr static T c6 = -1.85423981747590005397796630859375e-4; - - const vec<T, N> pi = c_pi<T>; - - vec<T, N> xx = x - pi; - vec<T, N> y = abs(xx); - y = select(y > c_pi<T, 1, 2>, pi - y, y); - y = y ^ (msk & ~xx); - - vec<T, N> y2 = y * y; - vec<T, N> formula = c6; - vec<T, N> y3 = y2 * y; - formula = fmadd(formula, y2, c4); - formula = fmadd(formula, y2, c2); - formula = formula * y3 + y; - return formula; -} - 
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fastcos(const vec<T, N>& x) -{ - x += c_pi<T, 1, 2>; - x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x); - return fastsin(x); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sincos(const vec<T, N>& x) -{ - return sincos_mask(x, internal::oddmask<T, N>()); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x) -{ - return sincos_mask(x, internal::evenmask<T, N>()); -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x) -{ - return select(abs(x) <= constants<T>::epsilon, T(1), sin(x) / x); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sin(const vec<T, N>& x) -{ - return sin(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cos(const vec<T, N>& x) -{ - return cos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> fastsin(const vec<T, N>& x) -{ - return fastsin(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> fastcos(const vec<T, N>& x) -{ - return fastcos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sincos(const vec<T, N>& x) -{ - return sincos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cossin(const vec<T, N>& x) -{ - return cossin(cast<Tout>(x)); -} - -template <typename T, size_t N, 
KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinc(const vec<T, N>& x) -{ - return sinc(cast<Tout>(x)); -} - -KFR_I_FLT_CONVERTER(sin) -KFR_I_FLT_CONVERTER(cos) -KFR_I_FLT_CONVERTER(fastsin) -KFR_I_FLT_CONVERTER(fastcos) -KFR_I_FLT_CONVERTER(sincos) -KFR_I_FLT_CONVERTER(cossin) -KFR_I_FLT_CONVERTER(sinc) - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout sindeg(const T& x) -{ - return sin(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout cosdeg(const T& x) -{ - return cos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout fastsindeg(const T& x) -{ - return fastsin(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout fastcosdeg(const T& x) -{ - return fastcos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout sincosdeg(const T& x) -{ - return sincos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout cossindeg(const T& x) -{ - return cossin(x * constants<Tout>::degtorad); -} -} // namespace intrinsics - -KFR_I_FN(sin) -KFR_I_FN(cos) -KFR_I_FN(fastsin) -KFR_I_FN(fastcos) -KFR_I_FN(sincos) -KFR_I_FN(cossin) - -KFR_I_FN(sindeg) -KFR_I_FN(cosdeg) -KFR_I_FN(fastsindeg) -KFR_I_FN(fastcosdeg) -KFR_I_FN(sincosdeg) -KFR_I_FN(cossindeg) - -KFR_I_FN(sinc) - -} // namespace kfr diff --git a/include/kfr/base/impl/sqrt.hpp b/include/kfr/base/impl/sqrt.hpp @@ -1,71 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(*extend<4>(x)))); } -KFR_SINTRIN f64x1 sqrt(const f64x1& x) -{ - return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x)))); -} -KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); } -KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); } - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); } -KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(*x); } -KFR_SINTRIN f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(*x); } -#endif - -KFR_HANDLE_ALL_SIZES_FLT_1(sqrt) - -#else - -// fallback -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sqrt(const vec<T, N>& x) -{ - return apply([](Tout x) { return std::sqrt(x); }, cast<Tout>(x)); -} -#endif -KFR_I_FLT_CONVERTER(sqrt) -} // namespace intrinsics -KFR_I_FN(sqrt) - -} // namespace kfr diff --git a/include/kfr/base/impl/tan.hpp b/include/kfr/base/impl/tan.hpp @@ -1,141 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright 
(C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" -#include "../sin_cos.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse) -{ - constexpr T pi_14 = c_pi<T, 1, 4>; - - vec<T, N> y = abs(x_full); - vec<T, N> scaled = y / pi_14; - - vec<T, N> k_real = floor(scaled); - vec<IT, N> k = cast<IT>(k_real); - - vec<T, N> x = y - k_real * pi_14; - - mask<T, N> need_offset = (k & 1) != 0; - x = select(need_offset, x - pi_14, x); - - vec<IT, N> k_mod4 = k & 3; - inverse = (k_mod4 == 1) || (k_mod4 == 2); - return x; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full) -{ - mask<f32, N> inverse; - vec<i32, N> quad; - const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); - inverse = quad == 2 || quad == 6; - - constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 
3.333315551280975342e-01); - constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); - constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02); - constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02); - constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03); - constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03); - - constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01); - constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02); - constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03); - constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04); - constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05); - constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07); - - const vec<f32, N> x2 = x * x; - const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, - tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12); - - const vec<f32, N> z = select(inverse, val / -x, val * x); - return mulsign(z, x_full); -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full) -{ - mask<f64, N> inverse; - vec<i64, N> quad; - const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); - inverse = quad == 2 || quad == 6; - - constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01); - constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01); - constexpr f64 tan_c6 = CMT_FP(0xd.d0d90de32b3e8p-8, 5.396801556632355862e-02); - constexpr f64 tan_c8 = CMT_FP(0x5.99723bdcf5cacp-8, 2.187265359403693307e-02); - constexpr f64 tan_c10 = CMT_FP(0x2.434a142e413ap-8, 8.839254309582239566e-03); - constexpr f64 tan_c12 = CMT_FP(0xf.2b59061305efp-12, 3.703449009834865711e-03); - constexpr f64 tan_c14 = CMT_FP(0x4.a12565071a664p-12, 1.130243370829653185e-03); - constexpr f64 tan_c16 = 
CMT_FP(0x4.dada3797ac1bcp-12, 1.185276423238536747e-03); - constexpr f64 tan_c18 = CMT_FP(-0x1.a74976b6ea3f3p-12, -4.036779095551438937e-04); - constexpr f64 tan_c20 = CMT_FP(0x1.d06a5ae5e4a74p-12, 4.429010863244216712e-04); - - constexpr f64 cot_c2 = CMT_FP(-0x5.5555555555554p-4, -3.333333333333333148e-01); - constexpr f64 cot_c4 = CMT_FP(-0x5.b05b05b05b758p-8, -2.222222222222377391e-02); - constexpr f64 cot_c6 = CMT_FP(-0x8.ab355dffc79a8p-12, -2.116402116358796163e-03); - constexpr f64 cot_c8 = CMT_FP(-0xd.debbca405c9f8p-16, -2.116402122295888289e-04); - constexpr f64 cot_c10 = CMT_FP(-0x1.66a8edb99b15p-16, -2.137779458737224013e-05); - constexpr f64 cot_c12 = CMT_FP(-0x2.450239be0ee92p-20, -2.164426049513111728e-06); - constexpr f64 cot_c14 = CMT_FP(-0x3.ad6ddb4719438p-24, -2.191935496317727080e-07); - constexpr f64 cot_c16 = CMT_FP(-0x5.ff4c42741356p-28, -2.234152473099993830e-08); - constexpr f64 cot_c18 = CMT_FP(-0x9.06881bcdf3108p-32, -2.101416316020595077e-09); - constexpr f64 cot_c20 = CMT_FP(-0x1.644abedc113cap-32, -3.240456633529511097e-10); - - const vec<f64, N> x2 = x * x; - const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, - cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, - cot_c16, tan_c16, cot_c18, tan_c18, cot_c20, tan_c20); - - const vec<f64, N> z = select(inverse, val / -x, val * x); - return mulsign(z, x_full); -} - -KFR_I_FLT_CONVERTER(tan) -template <typename T> -KFR_SINTRIN flt_type<T> tandeg(const T& x) -{ - return tan(x * c_degtorad<flt_type<T>>); -} -} // namespace intrinsics -KFR_I_FN(tan) -KFR_I_FN(tandeg) - -} // namespace kfr diff --git a/include/kfr/base/intrinsics.h b/include/kfr/base/intrinsics.h @@ -1,18 +0,0 @@ -#pragma once - -#include "kfr.h" - -#ifdef CMT_ARCH_SSE2 -#include <immintrin.h> -#ifdef CMT_OS_WIN -#include <intrin.h> -#endif -#endif - -#ifdef CMT_ARCH_NEON -#include <arm_neon.h> -#endif - -#if defined CMT_COMPILER_GCC && defined 
CMT_ARCH_X86 -#include <x86intrin.h> -#endif diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h @@ -1,46 +0,0 @@ -/** @addtogroup utility - * @{ - */ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -#include "../cident.h" - -#define KFR_VERSION_MAJOR 3 -#define KFR_VERSION_MINOR 0 -#define KFR_VERSION_BUILD 4 -#define KFR_VERSION_STRING \ - CMT_STRINGIFY(KFR_VERSION_MAJOR) "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_BUILD) -#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_BUILD) - -#ifdef CMT_ARCH_X64 -#define KFR_VERSION_FULL \ - "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \ - ")" -#else -#define KFR_VERSION_FULL \ - "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \ - ")" -#endif - -#ifdef __cplusplus -namespace kfr -{ -/// @brief KFR version string -constexpr const char version_string[] = KFR_VERSION_STRING; - -constexpr int version_major = KFR_VERSION_MAJOR; -constexpr int version_minor = KFR_VERSION_MINOR; -constexpr int version_build = KFR_VERSION_BUILD; -constexpr int version = KFR_VERSION; - -/// @brief KFR version string including architecture and compiler name -constexpr const char version_full[] = KFR_VERSION_FULL; -} // namespace kfr -#endif - -#define KFR_INTRIN CMT_INTRIN -#define KFR_FUNC CMT_FUNC -#define KFR_SINTRIN CMT_INTRIN static diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp @@ -1,229 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/log_exp.hpp" - -namespace kfr -{ - -/// @brief Returns e raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp(const T1& x) -{ - return intrinsics::exp(x); -} - -/// @brief Returns e raised to the given power x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp, E1> exp(E1&& x) -{ - return { fn::exp(), std::forward<E1>(x) }; -} - -/// @brief Returns 2 raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp2(const T1& x) -{ - return intrinsics::exp2(x); -} - -/// @brief Returns 2 raised to the given power x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp2, E1> exp2(E1&& x) -{ - return { fn::exp2(), std::forward<E1>(x) }; -} - -/// @brief Returns 10 raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp10(const T1& x) -{ - return intrinsics::exp10(x); -} - -/// @brief Returns 10 raised to the given power x. Version that accepts and returns expressions. 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp10, E1> exp10(E1&& x) -{ - return { fn::exp10(), std::forward<E1>(x) }; -} - -/// @brief Returns the natural logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log(const T1& x) -{ - return intrinsics::log(x); -} - -/// @brief Returns the natural logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log, E1> log(E1&& x) -{ - return { fn::log(), std::forward<E1>(x) }; -} - -/// @brief Returns the binary (base-2) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log2(const T1& x) -{ - return intrinsics::log2(x); -} - -/// @brief Returns the binary (base-2) logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log2, E1> log2(E1&& x) -{ - return { fn::log2(), std::forward<E1>(x) }; -} - -/// @brief Returns the common (base-10) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log10(const T1& x) -{ - return intrinsics::log10(x); -} - -/// @brief Returns the common (base-10) logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log10, E1> log10(E1&& x) -{ - return { fn::log10(), std::forward<E1>(x) }; -} - -/// @brief Returns the rounded binary (base-2) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> logb(const T1& x) -{ - return intrinsics::logb(x); -} - -/// @brief Returns the rounded binary (base-2) logarithm of the x. 
Version that accepts and returns -/// expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::logb, E1> logb(E1&& x) -{ - return { fn::logb(), std::forward<E1>(x) }; -} - -/// @brief Returns the logarithm of the x with base y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y) -{ - return intrinsics::logn(x, y); -} - -/// @brief Returns the logarithm of the x with base y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y) -{ - return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns log(x) * y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y) -{ - return intrinsics::logm(x, y); -} - -/// @brief Returns log(x) * y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y) -{ - return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns exp(x * m + a). -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_FUNC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z) -{ - return intrinsics::exp_fmadd(x, y, z); -} - -/// @brief Returns exp(x * m + a). Version that accepts and returns expressions. 
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_FUNC internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z) -{ - return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; -} - -/// @brief Returns log(x) * m + a. -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_FUNC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z) -{ - return intrinsics::log_fmadd(x, y, z); -} - -/// @brief Returns log(x) * m + a. Version that accepts and returns expressions. -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_FUNC internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z) -{ - return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; -} - -/// @brief Returns the x raised to the given power y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y) -{ - return intrinsics::pow(x, y); -} - -/// @brief Returns the x raised to the given power y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y) -{ - return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns the real nth root of the x. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> root(const T1& x, const T2& y) -{ - return intrinsics::root(x, y); -} - -/// @brief Returns the real nth root of the x. Version that accepts and returns expressions. 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y) -{ - return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns the cube root of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cbrt(const T1& x) -{ - return intrinsics::cbrt(x); -} - -/// @brief Returns the cube root of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cbrt, E1> cbrt(E1&& x) -{ - return { fn::cbrt(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -1,50 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/logical.hpp" - -namespace kfr -{ - -/** - * @brief Returns x[0] && x[1] && ... 
&& x[N-1] - */ -template <typename T, size_t N> -KFR_SINTRIN bool all(const mask<T, N>& x) -{ - return intrinsics::bittestall(x.asvec()); -} - -/** - * @brief Returns x[0] || x[1] || ... || x[N-1] - */ -template <typename T, size_t N> -KFR_SINTRIN bool any(const mask<T, N>& x) -{ - return intrinsics::bittestany(x.asvec()); -} -} // namespace kfr diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp @@ -1,4 +1,4 @@ -/** @addtogroup utility +/** @addtogroup memory * @{ */ /* @@ -25,8 +25,8 @@ */ #pragma once -#include "read_write.hpp" -#include "types.hpp" +#include "../simd/read_write.hpp" +#include "../simd/types.hpp" #include <algorithm> #include <atomic> #include <memory> @@ -34,7 +34,7 @@ namespace kfr { -namespace internal +namespace internal_generic { struct memory_statistics @@ -51,6 +51,8 @@ inline memory_statistics& get_memory_statistics() return ms; } +#pragma pack(push, 1) + struct mem_header { u8 offset; @@ -60,13 +62,18 @@ struct mem_header unsigned int references_uint; size_t size; - CMT_INLINE std::atomic_uint& references() { return reinterpret_cast<std::atomic_uint&>(references_uint); } + KFR_MEM_INTRINSIC std::atomic_uint& references() + { + return reinterpret_cast<std::atomic_uint&>(references_uint); + } } #ifdef CMT_GNU_ATTRIBUTES __attribute__((__packed__)) #endif ; +#pragma pack(pop) + inline mem_header* aligned_header(void* ptr) { return ptr_cast<mem_header>(ptr) - 1; } inline size_t aligned_size(void* ptr) { return aligned_header(ptr)->size; } @@ -103,58 +110,58 @@ inline void aligned_free(void* ptr) } inline void aligned_release(void* ptr) { aligned_free(ptr); } -} // namespace internal +} // namespace internal_generic /// @brief Allocates aligned memory template <typename T = void, size_t alignment = platform<>::native_cache_alignment> -CMT_INLINE T* aligned_allocate(size_t size = 1) +KFR_INTRINSIC T* aligned_allocate(size_t size = 1) { T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED( - 
internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment), + internal_generic::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment), alignment)); return ptr; } /// @brief Deallocates aligned memory template <typename T = void> -CMT_INLINE void aligned_deallocate(T* ptr) +KFR_INTRINSIC void aligned_deallocate(T* ptr) { - return internal::aligned_free(ptr); + return internal_generic::aligned_free(ptr); } -namespace internal +namespace internal_generic { template <typename T> struct aligned_deleter { - CMT_INLINE void operator()(T* ptr) const { aligned_deallocate(ptr); } + KFR_MEM_INTRINSIC void operator()(T* ptr) const { aligned_deallocate(ptr); } }; -} // namespace internal +} // namespace internal_generic template <typename T> struct autofree { - CMT_INLINE autofree() {} - explicit CMT_INLINE autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} + KFR_MEM_INTRINSIC autofree() {} + explicit KFR_MEM_INTRINSIC autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} autofree(const autofree&) = delete; autofree& operator=(const autofree&) = delete; - autofree(autofree&&) noexcept = default; - autofree& operator=(autofree&&) noexcept = default; - CMT_INLINE T& operator[](size_t index) noexcept { return ptr[index]; } - CMT_INLINE const T& operator[](size_t index) const noexcept { return ptr[index]; } + autofree(autofree&&) CMT_NOEXCEPT = default; + autofree& operator=(autofree&&) CMT_NOEXCEPT = default; + KFR_MEM_INTRINSIC T& operator[](size_t index) CMT_NOEXCEPT { return ptr[index]; } + KFR_MEM_INTRINSIC const T& operator[](size_t index) const CMT_NOEXCEPT { return ptr[index]; } template <typename U = T> - CMT_INLINE U* data() noexcept + KFR_MEM_INTRINSIC U* data() CMT_NOEXCEPT { return ptr_cast<U>(ptr.get()); } template <typename U = T> - CMT_INLINE const U* data() const noexcept + KFR_MEM_INTRINSIC const U* data() const CMT_NOEXCEPT { return ptr_cast<U>(ptr.get()); } - std::unique_ptr<T[], 
internal::aligned_deleter<T>> ptr; + std::unique_ptr<T[], internal_generic::aligned_deleter<T>> ptr; }; #ifdef KFR_USE_STD_ALLOCATION @@ -181,14 +188,14 @@ struct allocator { using other = allocator<U>; }; - constexpr allocator() noexcept = default; - constexpr allocator(const allocator&) noexcept = default; + constexpr allocator() CMT_NOEXCEPT = default; + constexpr allocator(const allocator&) CMT_NOEXCEPT = default; template <typename U> - constexpr allocator(const allocator<U>&) noexcept + constexpr allocator(const allocator<U>&) CMT_NOEXCEPT { } - pointer address(reference x) const noexcept { return std::addressof(x); } - const_pointer address(const_reference x) const noexcept { return std::addressof(x); } + pointer address(reference x) const CMT_NOEXCEPT { return std::addressof(x); } + const_pointer address(const_reference x) const CMT_NOEXCEPT { return std::addressof(x); } pointer allocate(size_type n, std::allocator<void>::const_pointer = 0) const { pointer result = aligned_allocate<value_type>(n); @@ -211,12 +218,12 @@ struct allocator }; template <typename T1, typename T2> -constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) noexcept +constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT { return true; } template <typename T1, typename T2> -constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) noexcept +constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT { return false; } @@ -243,4 +250,5 @@ public: \ private: \ mutable std::atomic_uintptr_t m_refcount = ATOMIC_VAR_INIT(0); + } // namespace kfr diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -1,107 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - 
the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/min_max.hpp" - -namespace kfr -{ -/** - * @brief Returns the smaller of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout min(const T1& x, const T2& y) -{ - return intrinsics::min(x, y); -} - -/** - * @brief Returns template expression that returns the smaller of two values. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y) -{ - return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the greater of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout max(const T1& x, const T2& y) -{ - return intrinsics::max(x, y); -} - -/** - * @brief Returns template expression that returns the greater of two values. 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y) -{ - return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the smaller in magnitude of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout absmin(const T1& x, const T2& y) -{ - return intrinsics::absmin(x, y); -} - -/** - * @brief Returns template expression that returns the smaller in magnitude of two values. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y) -{ - return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the greater in magnitude of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout absmax(const T1& x, const T2& y) -{ - return intrinsics::absmax(x, y); -} - -/** - * @brief Returns template expression that returns the greater in magnitude of two values. 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y) -{ - return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp @@ -1,44 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/modzerobessel.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 modzerobessel(const T1& x) -{ - return intrinsics::modzerobessel(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x) -{ - return { fn::modzerobessel(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp @@ -1,552 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "bitwise.hpp" -#include "function.hpp" -#include <algorithm> -#include <utility> - -namespace kfr -{ - -template <typename T> -constexpr inline T add(const T& x) -{ - return x; -} - -/** - * @brief Returns sum of all the arguments passed to a function. - */ -template <typename T1, typename T2, typename... 
Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)> -constexpr inline common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest) -{ - return x + add(y, rest...); -} -template <typename T> -constexpr inline T add(initialvalue<T>) -{ - return T(0); -} -KFR_FN(add) - -/** - * @brief Returns template expression that returns sum of all the arguments passed to a function. - */ -template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)> -CMT_INLINE internal::expression_function<fn::add, E...> add(E&&... x) -{ - return { fn::add(), std::forward<E>(x)... }; -} - -template <typename T1, typename T2> -constexpr inline common_type<T1, T2> sub(const T1& x, const T2& y) -{ - return x - y; -} -template <typename T> -constexpr inline T sub(initialvalue<T>) -{ - return T(0); -} -KFR_FN(sub) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y) -{ - return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -template <typename T1> -constexpr inline T1 mul(const T1& x) -{ - return x; -} - -/** - * @brief Returns product of all the arguments passed to a function. - */ -template <typename T1, typename T2, typename... Ts> -constexpr inline common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest) -{ - return x * mul(y, rest...); -} - -template <typename T> -constexpr inline T mul(initialvalue<T>) -{ - return T(1); -} -KFR_FN(mul) - -/** - * @brief Returns template expression that returns product of all the arguments passed to a function. - */ -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::mul, E...> mul(E&&... x) -{ - return { fn::mul(), std::forward<E>(x)... }; -} - -/** - * @brief Returns square of x. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -constexpr inline T1 sqr(const T1& x) -{ - return x * x; -} -KFR_FN(sqr) - -/** - * @brief Returns template expression that returns square of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::sqr, E1> sqr(E1&& x) -{ - return { fn::sqr(), std::forward<E1>(x) }; -} - -/** - * @brief Returns cube of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -constexpr inline T1 cub(const T1& x) -{ - return sqr(x) * x; -} -KFR_FN(cub) - -/** - * @brief Returns template expression that returns cube of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::cub, E1> cub(E1&& x) -{ - return { fn::cub(), std::forward<E1>(x) }; -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow2(const T& x) -{ - return sqr(x); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow3(const T& x) -{ - return cub(x); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow4(const T& x) -{ - return sqr(sqr(x)); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow5(const T& x) -{ - return pow4(x) * x; -} -KFR_FN(pow2) -KFR_FN(pow3) -KFR_FN(pow4) -KFR_FN(pow5) - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow2, E1> pow2(E1&& x) -{ - return { fn::pow2(), std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow3, E1> pow3(E1&& x) -{ - return { fn::pow3(), std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow4, E1> pow4(E1&& x) -{ - return { fn::pow4(), 
std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow5, E1> pow5(E1&& x) -{ - return { fn::pow5(), std::forward<E1>(x) }; -} - -/// Raise x to the power base \f$ x^{base} \f$ -/// @code -/// CHECK( ipow( 10, 3 ) == 1000 ); -/// CHECK( ipow( 0.5, 2 ) == 0.25 ); -/// @endcode -template <typename T> -constexpr inline T ipow(const T& x, int base) -{ - T xx = x; - T result = T(1); - while (base) - { - if (base & 1) - result *= xx; - base >>= 1; - xx *= xx; - } - return result; -} -KFR_FN(ipow) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b) -{ - return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) }; -} - -/// Return square of the sum of all arguments -/// @code -/// CHECK(sqrsum(1,2,3) == 36); -/// @endcode -template <typename T1, typename... Ts> -constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... 
rest) -{ - return sqr(add(x, rest...)); -} - -template <typename T1, typename T2> -constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y) -{ - return sqr(x - y); -} -KFR_FN(sqrsum) -KFR_FN(sqrdiff) - -/// Division -template <typename T1, typename T2, typename Tout = common_type<T1, T2>> -CMT_INLINE Tout div(const T1& x, const T2& y) -{ - return static_cast<Tout>(x) / static_cast<Tout>(y); -} -KFR_FN(div) - -/// Remainder -template <typename T1, typename T2, typename Tout = common_type<T1, T2>> -CMT_INLINE Tout rem(const T1& x, const T2& y) -{ - return static_cast<Tout>(x) % static_cast<Tout>(y); -} -KFR_FN(rem) - -/// Negation -template <typename T1> -inline T1 neg(const T1& x) -{ - return -x; -} -KFR_FN(neg) - -/// @brief Fused Multiply-Add -template <typename T1, typename T2, typename T3> -KFR_INTRIN constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z) -{ - return x * y + z; -} -/// @brief Fused Multiply-Sub -template <typename T1, typename T2, typename T3> -KFR_INTRIN constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z) -{ - return x * y - z; -} -KFR_FN(fmadd) -KFR_FN(fmsub) - -/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1) -/// Returns `x + ( y - x ) * c` -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_INTRIN constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y) -{ - return fmadd(c, y - x, x); -} - -/// @brief Linear blend of `x` and `y` (`c` must be in the range -1...+1) -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_INTRIN constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y) -{ - return mix(fmadd(c, 0.5, 0.5), x, y); -} -KFR_FN(mix) -KFR_FN(mixs) - -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -CMT_INLINE internal::expression_function<fn::mix, 
E1, E2, E3> mix(E1&& c, E2&& x, E3&& y) -{ - return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; -} - -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -CMT_INLINE internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y) -{ - return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; -} - -namespace internal -{ - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner(const T1&, const T2& c0) -{ - return c0; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1, - const Ts&... values) -{ - return fmadd(horner(x, c1, values...), x, c0); -} - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner_even(const T1&, const T2& c0) -{ - return c0; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2, - const Ts&... values) -{ - const T1 x2 = x * x; - return fmadd(horner(x2, c2, values...), x2, c0); -} - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner_odd(const T1& x, const T2& c1) -{ - return c1 * x; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3, - const Ts&... values) -{ - const T1 x2 = x * x; - return fmadd(horner(x2, c3, values...), x2, c1) * x; -} -} // namespace internal - -/// @brief Calculate polynomial using Horner's method -/// -/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner(const T1& x, const Ts&... 
c) -{ - return internal::horner(x, c...); -} -KFR_FN(horner) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner, E...> horner(E&&... x) -{ - return { fn::horner(), std::forward<E>(x)... }; -} - -/// @brief Calculate polynomial using Horner's method (even powers) -/// -/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c) -{ - return internal::horner_even(x, c...); -} -KFR_FN(horner_even) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner_even, E...> horner_even(E&&... x) -{ - return { fn::horner_even(), std::forward<E>(x)... }; -} - -/// @brief Calculate polynomial using Horner's method (odd powers) -/// -/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c) -{ - return internal::horner_odd(x, c...); -} -KFR_FN(horner_odd) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... x) -{ - return { fn::horner_odd(), std::forward<E>(x)... 
}; -} - -/// @brief Calculate Multiplicative Inverse of `x` -/// Returns `1/x` -template <typename T> -constexpr CMT_INLINE T reciprocal(const T& x) -{ - static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type"); - return subtype<T>(1) / x; -} -KFR_FN(reciprocal) - -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> mulsign(const T1& x, const T2& y) -{ - return bitwisexor(x, bitwiseand(y, constants<T2>::highbitmask())); -} -KFR_FN(mulsign) - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) -{ - return (x & constants<T>::highbitmask()) | (y & constants<T>::highbitmask()); -} - -/// @brief Swap byte order -template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> -CMT_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x) -{ - return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x))); -} -KFR_FN(swapbyteorder) - -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return blend<1, 0>(a + b, a - b); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return blend<0, 1>(a + b, a - b); -} -KFR_FN(subadd) -KFR_FN(addsub) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> negeven(const vec<T, N>& x) -{ - return x ^ broadcast<N>(-T(), T()); -} 
-template <typename T, size_t N> -CMT_INLINE vec<T, N> negodd(const vec<T, N>& x) -{ - return x ^ broadcast<N>(T(), -T()); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::neg, E1> operator-(E1&& e1) -{ - return { fn::neg(), std::forward<E1>(e1) }; -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1) -{ - return { fn::bitwisenot(), std::forward<E1>(e1) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2) -{ - return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2) -{ - return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2) -{ - return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2) -{ - return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2) -{ - return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2) -{ 
- return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2) -{ - return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2) -{ - return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2) -{ - return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename T, size_t N1, size_t... Ns> -vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest) -{ - const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...)); - return compcast<vec<T, sizeof...(Ns) + 1>>(t); -} - -KFR_FN(packtranspose) -} // namespace kfr diff --git a/include/kfr/base/platform.hpp b/include/kfr/base/platform.hpp @@ -1,186 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "types.hpp" - -namespace kfr -{ - -/// @brief An enumeration representing cpu instruction set -enum class cpu_t : int -{ - common = 0, -#ifdef CMT_ARCH_X86 - sse2 = 1, - sse3 = 2, - ssse3 = 3, - sse41 = 4, - sse42 = 5, - avx1 = 6, - avx2 = 7, - avx512 = 8, // F, CD, VL, DQ and BW - avx = static_cast<int>(avx1), - lowest = static_cast<int>(sse2), - highest = static_cast<int>(avx512), -#endif -#ifdef CMT_ARCH_ARM - neon = 1, - neon64 = 2, - lowest = static_cast<int>(neon), - highest = static_cast<int>(neon64), -#endif - native = static_cast<int>(CMT_ARCH_NAME), - runtime = -1, -}; - -#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native - -template <cpu_t cpu> -using ccpu_t = cval_t<cpu_t, cpu>; - -template <cpu_t cpu> -constexpr ccpu_t<cpu> ccpu{}; - -namespace internal -{ -constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } -constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } - -#ifdef CMT_ARCH_X86 -constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, - cpu_t::sse3, cpu_t::sse2>(); -#else -constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>; -#endif -} // namespace internal - -template <cpu_t cpu> -using cpuval_t = cval_t<cpu_t, cpu>; -template <cpu_t cpu> -constexpr auto cpuval = cpuval_t<cpu>{}; - -constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval_t<cpu_t::native>()); - -/// @brief Returns name of the cpu instruction set -CMT_UNUSED static const char* cpu_name(cpu_t set) -{ -#ifdef CMT_ARCH_X86 - static const char* names[] = { "common", "sse2", "sse3", "ssse3", "sse41", - "sse42", "avx1", "avx2", 
"avx512" }; -#endif -#ifdef CMT_ARCH_ARM - static const char* names[] = { "common", "neon", "neon64" }; -#endif - if (set >= cpu_t::lowest && set <= cpu_t::highest) - return names[static_cast<size_t>(set)]; - return "-"; -} - -#ifdef CMT_ARCH_X64 -template <int = 0> -constexpr inline const char* bitness_const(const char*, const char* x64) -{ - return x64; -} -template <typename T> -constexpr inline const T& bitness_const(const T&, const T& x64) -{ - return x64; -} -#else -template <int = 0> -constexpr inline const char* bitness_const(const char* x32, const char*) -{ - return x32; -} -template <typename T> -constexpr inline const T& bitness_const(const T& x32, const T&) -{ - return x32; -} -#endif - -template <typename T = i32, cpu_t c = cpu_t::native> -struct platform -{ - constexpr static size_t native_cache_alignment = 64; - constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; - constexpr static size_t maximum_vector_alignment = 32; - constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; -#ifdef CMT_ARCH_X86 - constexpr static size_t simd_register_count = - c >= cpu_t::avx512 ? bitness_const(8, 32) : bitness_const(8, 16); -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t simd_register_count = 16; -#endif - - constexpr static size_t common_float_vector_size = 16; - constexpr static size_t common_int_vector_size = 16; - -#ifdef CMT_ARCH_X86 - constexpr static size_t native_float_vector_size = - c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size; -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t native_float_vector_size = c == cpu_t::neon ? 16 : common_float_vector_size; -#endif -#ifdef CMT_ARCH_X86 - constexpr static size_t native_int_vector_size = - c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t native_int_vector_size = c == cpu_t::neon ? 
16 : common_int_vector_size; -#endif - - /// @brief SIMD vector width for the given cpu instruction set - constexpr static size_t vector_width = - (const_max(size_t(1), typeclass<T> == datatype::f ? native_float_vector_size / sizeof(T) - : native_int_vector_size / sizeof(T))); - - constexpr static size_t vector_capacity = simd_register_count * vector_width; - - constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity / 4); - - constexpr static size_t native_vector_alignment = - const_max(native_float_vector_size, native_int_vector_size); - - constexpr static bool fast_unaligned = -#ifdef CMT_ARCH_X86 - c >= cpu_t::avx1; -#else - false; -#endif - - constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; -}; - -template <typename T, size_t N = platform<T>::vector_width> -struct vec; -template <typename T, size_t N = platform<T>::vector_width> -struct mask; -} // namespace kfr diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp @@ -25,14 +25,17 @@ */ #pragma once +#include "../simd/vec.hpp" #include "basic_expressions.hpp" -#include "vec.hpp" #include <memory> namespace kfr { +inline namespace CMT_ARCH_NAME +{ -constexpr size_t maximum_expression_width = platform<float>::vector_capacity / 4; +template <typename T> +constexpr size_t maximum_expression_width = vector_width<T> * 2; template <typename T, bool enable_resource = true> struct expression_pointer; @@ -41,11 +44,11 @@ namespace internal { template <typename Expression, typename T, size_t key = 0> -KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = {}); +KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = {}); } -template <typename T, size_t N = maximum_expression_width> +template <typename T, size_t N = maximum_expression_width<T>> struct expression_vtable : expression_vtable<T, N / 2> { using func_get = void 
(*)(void*, size_t, vec<T, N>&); @@ -60,7 +63,7 @@ struct expression_vtable : expression_vtable<T, N / 2> template <typename Expression> static void static_get(void* instance, size_t index, vec<T, N>& result) { - result = static_cast<Expression*>(instance)->operator()(cinput, index, vec_t<T, N>()); + result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>()); } }; @@ -78,7 +81,7 @@ struct expression_vtable<T, 0> func_substitute substitute; template <typename Expression> - expression_vtable(ctype_t<Expression> t) + expression_vtable(ctype_t<Expression>) : size(&expression_vtable<T, 0>::template static_size<Expression>), begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>), end_block(&expression_vtable<T, 0>::template static_end_block<Expression>), @@ -117,7 +120,7 @@ struct expression_resource template <typename E> struct expression_resource_impl : expression_resource { - expression_resource_impl(E&& e) noexcept : e(std::move(e)) {} + expression_resource_impl(E&& e) CMT_NOEXCEPT : e(std::move(e)) {} virtual ~expression_resource_impl() {} virtual void* instance() override final { return &e; } @@ -126,7 +129,7 @@ private: }; template <typename E> -KFR_SINTRIN std::shared_ptr<expression_resource> make_resource(E&& e) +KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e) { using T = expression_resource_impl<decay<E>>; return std::static_pointer_cast<expression_resource>( @@ -138,31 +141,35 @@ struct expression_pointer : input_expression { using value_type = T; - expression_pointer() noexcept : instance(nullptr), vtable(nullptr) {} + expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {} expression_pointer(void* instance, const expression_vtable<T>* vtable, std::shared_ptr<expression_resource> resource = nullptr) : instance(instance), vtable(vtable), resource(std::move(resource)) { } - template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width)> - CMT_INLINE vec<T, N> 
operator()(cinput_t, size_t index, vec_t<T, N>) const + template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)> + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index, + vec_shape<T, N>) { static_assert(is_poweroftwo(N), "N must be a power of two"); vec<T, N> result; - static_cast<const expression_vtable<T, N>*>(vtable)->get(instance, index, result); + static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result); return result; } - template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width)> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)> + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - return concat(operator()(cinput, index, vec_t<T, N / 2>()), operator()(cinput, index + N / 2, - vec_t<T, N / 2>())); + static_assert(is_poweroftwo(N), "N must be a power of two"); + const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>()); + const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>()); + return concat(r1, r2); } - CMT_INLINE void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); } - CMT_INLINE void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); } - CMT_INLINE size_t size() const { return vtable->size(instance); } + KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); } + KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); } + KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); } - CMT_INLINE bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const + KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = 
csize_t<0>{}) const { return vtable->substitute(instance, std::move(new_pointer)); } @@ -179,7 +186,7 @@ namespace internal { template <typename T, typename E> -CMT_INLINE expression_vtable<T>* make_expression_vtable() +KFR_INTRINSIC expression_vtable<T>* make_expression_vtable() { static_assert(is_input_expression<E>::value, "E must be an expression"); static expression_vtable<T> vtable{ ctype_t<decay<E>>{} }; @@ -192,7 +199,7 @@ CMT_INLINE expression_vtable<T>* make_expression_vtable() * @warning Use with caution with local variables. */ template <typename E, typename T = value_type_of<E>> -CMT_INLINE expression_pointer<T> to_pointer(E& expr) +KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>()); @@ -203,7 +210,7 @@ CMT_INLINE expression_pointer<T> to_pointer(E& expr) * @note Use std::move to force use of this overload. */ template <typename E, typename T = value_type_of<E>> -CMT_INLINE expression_pointer<T> to_pointer(E&& expr) +KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr)); @@ -215,24 +222,25 @@ template <typename T, size_t key> class expression_placeholder : public input_expression { public: - using value_type = T; - expression_placeholder() noexcept = default; + using value_type = T; + expression_placeholder() CMT_NOEXCEPT = default; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + friend KFR_INTRINSIC vec<U, N> get_elements(const expression_placeholder& self, cinput_t, + size_t index, vec_shape<U, N>) { - return pointer ? cast<U>(pointer(cinput, index, vec_t<T, N>())) : 0; + return self.pointer ? 
elemcast<U>(get_elements(self.pointer, cinput, index, vec_shape<T, N>())) : 0; } expression_pointer<T> pointer; }; template <typename T, size_t key = 0> -KFR_SINTRIN expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{}) { return expression_placeholder<T, key>(); } template <typename... Args> -KFR_SINTRIN bool substitute(input_expression&, Args&&...) +KFR_INTRINSIC bool substitute(input_expression&, Args&&...) { return false; } @@ -240,28 +248,28 @@ KFR_SINTRIN bool substitute(input_expression&, Args&&...) namespace internal { template <typename... Args, typename T, size_t key, size_t... indices> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key>, csizes_t<indices...>); +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>); } template <typename T, size_t key = 0> -KFR_SINTRIN bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = csize_t<key>{}) { expr.pointer = std::move(new_pointer); return true; } template <typename... 
Args, typename T, size_t key = 0> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key> = csize_t<key>{}) { return internal::substitute(expr, std::move(new_pointer), csize_t<key>{}, indicesfor_t<Args...>{}); } template <typename T, size_t key = 0> -KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = csize_t<key>{}) { return expr.substitute(std::move(new_pointer), csize_t<key>{}); } @@ -269,17 +277,17 @@ KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& namespace internal { -KFR_SINTRIN bool var_or() { return false; } +KFR_INTRINSIC bool var_or() { return false; } template <typename... Args> -KFR_SINTRIN bool var_or(bool b, Args... args) +KFR_INTRINSIC bool var_or(bool b, Args... args) { return b || var_or(args...); } template <typename... Args, typename T, size_t key, size_t... 
indices> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key>, csizes_t<indices...>) +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>) { return var_or(substitute(std::get<indices>(expr.args), std::move(new_pointer), csize_t<key>())...); } @@ -290,10 +298,11 @@ namespace internal { template <typename Expression, typename T, size_t key> -KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>) +KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>) { return kfr::substitute(expr, std::move(new_pointer), csize_t<key>{}); } } // namespace internal +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup random * @{ */ /* @@ -24,55 +24,58 @@ See https://www.kfrlib.com for details. 
*/ #pragma once -#include "function.hpp" -#include "operators.hpp" -#include "shuffle.hpp" -#include "vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/vec.hpp" namespace kfr { -using random_state = u32x4; - #ifndef KFR_DISABLE_READCYCLECOUNTER - struct seed_from_rdtsc_t { }; constexpr seed_from_rdtsc_t seed_from_rdtsc{}; +#endif + +inline namespace CMT_ARCH_NAME +{ + +using random_state = u32x4; -#ifndef KFR_READCYCLECOUNTER +#ifndef KFR_DISABLE_READCYCLECOUNTER #ifdef CMT_COMPILER_CLANG -#define KFR_READCYCLECOUNTER() __builtin_readcyclecounter() +#define KFR_builtin_readcyclecounter() \ + static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here #else -#define KFR_READCYCLECOUNTER() __rdtsc() +#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc()) #endif #endif -#endif - struct random_bit_generator { - #ifndef KFR_DISABLE_READCYCLECOUNTER - random_bit_generator(seed_from_rdtsc_t) noexcept - : state(bitcast<u32>( - make_vector(KFR_READCYCLECOUNTER(), (KFR_READCYCLECOUNTER() << 11) ^ 0x710686d615e2257bull))) + KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT + : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(), + (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull))) { (void)operator()(); } #endif - random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3) + KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT + : state(x0, x1, x2, x3) { (void)operator()(); } - random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1))) + KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT + : state(bitcast<u32>(make_vector(x0, x1))) { (void)operator()(); } - inline random_state operator()() + KFR_MEM_INTRINSIC random_state operator()() { const static random_state mul{ 214013u, 17405u, 214013u, 69069u }; const static 
random_state add{ 2531011u, 10395331u, 13737667u, 1u }; @@ -87,13 +90,13 @@ protected: static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16"); template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))> -inline vec<u8, N> random_bits(random_bit_generator& gen) +KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen) { return narrow<N>(bitcast<u8>(gen())); } template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))> -inline vec<u8, N> random_bits(random_bit_generator& gen) +KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen) { constexpr size_t N2 = prev_poweroftwo(N - 1); const vec<u8, N2> bits1 = random_bits<N2>(gen); @@ -102,37 +105,37 @@ inline vec<u8, N> random_bits(random_bit_generator& gen) } template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)> -inline vec<T, N> random_uniform(random_bit_generator& gen) +KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen) { return bitcast<T>(random_bits<N * sizeof(T)>(gen)); } template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f32>::value)> -inline vec<f32, N> randommantissa(random_bit_generator& gen) +KFR_INTRINSIC vec<f32, N> randommantissa(random_bit_generator& gen) { return bitcast<f32>((random_uniform<u32, N>(gen) & 0x7FFFFFu) | 0x3f800000u) + 0.0f; } template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f64>::value)> -inline vec<f64, N> randommantissa(random_bit_generator& gen) +KFR_INTRINSIC vec<f64, N> randommantissa(random_bit_generator& gen) { return bitcast<f64>((random_uniform<u64, N>(gen) & 0x000FFFFFFFFFFFFFull) | 0x3FF0000000000000ull) + 0.0; } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -inline vec<T, N> random_uniform(random_bit_generator& gen) +KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen) { return randommantissa<T, N>(gen) - 1.f; } template <size_t N, typename T, KFR_ENABLE_IF(is_f_class<T>::value)> -inline vec<T, N> random_range(random_bit_generator& gen, T min, T max) 
+KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max) { return mix(random_uniform<T, N>(gen), min, max); } template <size_t N, typename T, KFR_ENABLE_IF(!is_f_class<T>::value)> -inline vec<T, N> random_range(random_bit_generator& gen, T min, T max) +KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max) { using big_type = findinttype<sqr(std::numeric_limits<T>::min()), sqr(std::numeric_limits<T>::max())>; @@ -147,11 +150,11 @@ template <typename T> struct expression_random_uniform : input_expression { using value_type = T; - constexpr expression_random_uniform(const random_bit_generator& gen) noexcept : gen(gen) {} + constexpr expression_random_uniform(const random_bit_generator& gen) CMT_NOEXCEPT : gen(gen) {} template <size_t N> - vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_random_uniform& self, cinput_t, size_t, vec_shape<T, N>) { - return random_uniform<T, N>(gen); + return random_uniform<T, N>(self.gen); } mutable random_bit_generator gen; }; @@ -160,15 +163,16 @@ template <typename T> struct expression_random_range : input_expression { using value_type = T; - constexpr expression_random_range(const random_bit_generator& gen, T min, T max) noexcept - : gen(gen), min(min), max(max) + constexpr expression_random_range(const random_bit_generator& gen, T min, T max) CMT_NOEXCEPT : gen(gen), + min(min), + max(max) { } template <size_t N> - vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_random_range& self, cinput_t, size_t, vec_shape<T, N>) { - return random_range<N, T>(gen, min, max); + return random_range<N, T>(self.gen, self.min, self.max); } mutable random_bit_generator gen; const T min; @@ -178,16 +182,15 @@ struct expression_random_range : input_expression /// @brief Returns expression that returns pseudo random values template <typename T> -inline internal::expression_random_uniform<T> 
gen_random_uniform(const random_bit_generator& gen) +KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen) { return internal::expression_random_uniform<T>(gen); } - #ifndef KFR_DISABLE_READCYCLECOUNTER /// @brief Returns expression that returns pseudo random values template <typename T> -inline internal::expression_random_uniform<T> gen_random_uniform() +KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform() { return internal::expression_random_uniform<T>(random_bit_generator(seed_from_rdtsc)); } @@ -195,18 +198,19 @@ inline internal::expression_random_uniform<T> gen_random_uniform() /// @brief Returns expression that returns pseudo random values of the given range template <typename T> -inline internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min, T max) +KFR_FUNCTION internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min, + T max) { return internal::expression_random_range<T>(gen, min, max); } - #ifndef KFR_DISABLE_READCYCLECOUNTER /// @brief Returns expression that returns pseudo random values of the given range template <typename T> -inline internal::expression_random_range<T> gen_random_range(T min, T max) +KFR_FUNCTION internal::expression_random_range<T> gen_random_range(T min, T max) { return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max); } #endif +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -1,239 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "shuffle.hpp" -#include "types.hpp" -#include "vec.hpp" - -namespace kfr -{ - -template <size_t N, bool A = false, typename T> -CMT_INLINE static vec<T, N> read(const T* src) -{ - return vec<T, N>(src, cbool_t<A>()); -} - -template <bool A = false, size_t N, typename T> -CMT_INLINE static void write(T* dest, const vec<T, N>& value) -{ - value.write(dest, cbool_t<A>()); -} - -template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -CMT_INLINE vec<T, Nout> gather(const T* base, size_t index, Indices... indices) -{ - return make_vector(base[index], base[indices]...); -} - -template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -CMT_INLINE vec<T, Nout> gather(const T* base) -{ - return make_vector(base[Index], base[Indices]...); -} - -template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> -CMT_INLINE void scatter(const T* base, const vec<T, N>& value) -{ - base[Index] = value[InIndex]; - scatter<Indices..., T, N, InIndex + 1>(base, value); -} - -namespace internal -{ -template <typename T, size_t N, size_t... Indices> -CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) -{ - return make_vector(base[indices[Indices]]...); -} -template <size_t Nout, size_t Stride, typename T, size_t... 
Indices> -CMT_INLINE vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) -{ - return make_vector(base[Indices * Stride]...); -} -template <size_t Nout, size_t groupsize, typename T, size_t... Indices> -CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) -{ - return make_vector(read<groupsize>(base + Indices * groupsize * stride)...); -} -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) -{ - return internal::gather(base, indices, csizeseq_t<N>()); -} - -template <size_t Nout, size_t groupsize = 1, typename T> -CMT_INLINE vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride) -{ - return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq_t<Nout>()); -} - -template <size_t Nout, size_t Stride, typename T> -CMT_INLINE vec<T, Nout> gather_stride(const T* base) -{ - return internal::gather_stride<Nout, Stride>(base, csizeseq_t<Nout>()); -} - -template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> -CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>) -{ - return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); -} -template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) -{ - return gather_helper<groupsize>(base, offset, csizeseq_t<N>()); -} - -template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> -CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, - csizes_t<Indices...>) -{ - swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), - 0)... }; -} -template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... 
Indices> -CMT_INLINE void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>) -{ - swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... }; -} -template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) -{ - return scatter_helper<groupsize>(base, offset, value, csizeseq_t<N>()); -} - -template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -CMT_INLINE void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride) -{ - return scatter_helper_s<groupsize>(base, stride, value, csizeseq_t<N>()); -} - -template <typename T, size_t groupsize = 1> -struct stride_pointer : public stride_pointer<const T, groupsize> -{ - template <size_t N> - void write(const vec<T, N>& val, csize_t<N> = csize_t<N>()) - { - kfr::scatter_stride<N, groupsize>(this->ptr, val); - } -}; - -template <typename T, size_t groupsize> -struct stride_pointer<const T, groupsize> -{ - const T* ptr; - const size_t stride; - - template <size_t N> - vec<T, N> read(csize_t<N> = csize_t<N>()) - { - return kfr::gather_stride<N, groupsize>(ptr, stride); - } -}; - -template <typename T> -constexpr T partial_masks[] = { constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - 
constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T() }; - -template <typename T, size_t N> -CMT_INLINE vec<T, N> partial_mask(size_t index) -{ - static_assert(N <= arraysize(partial_masks<T>) / 2, - "N must not be greater than half of partial_masks expression_array"); - return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index); -} -template <typename T, size_t N> -CMT_INLINE vec<T, N> partial_mask(size_t index, vec_t<T, N>) -{ - return partial_mask<T, N>(index); -} -} // namespace kfr diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp @@ -1,4 +1,4 @@ -/** @addtogroup expressions +/** @addtogroup array * @{ */ /* @@ -25,39 +25,41 @@ */ #pragma once +#include "../math/min_max.hpp" +#include "../simd/horizontal.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" #include "basic_expressions.hpp" -#include "function.hpp" -#include "horizontal.hpp" -#include "min_max.hpp" -#include "operators.hpp" -#include "vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> -CMT_INLINE T final_mean(T value, size_t size) +KFR_INTRINSIC T final_mean(T value, size_t size) { return value / T(size); } KFR_FN(final_mean) template <typename T> -CMT_INLINE T final_rootmean(T value, size_t size) +KFR_INTRINSIC T final_rootmean(T value, size_t size) { - return internal::builtin_sqrt(value / T(size)); + return builtin_sqrt(value / T(size)); } KFR_FN(final_rootmean) namespace internal { template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, T, size_t>::value)> -CMT_INLINE auto 
reduce_call_final(FinalFn&& finalfn, size_t size, T value) +KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t size, T value) { return finalfn(value, size); } template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, T, size_t>::value)> -CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) +KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t, T value) { return finalfn(value); } @@ -65,7 +67,7 @@ CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, KFR_ARCH_DEP> struct expression_reduce : output_expression { - constexpr static size_t width = platform<T>::vector_width * bitness_const(1, 2); + constexpr static size_t width = vector_width<T> * bitness_const(1, 2); using value_type = T; @@ -76,26 +78,29 @@ struct expression_reduce : output_expression } template <size_t N> - CMT_INLINE void operator()(coutput_t, size_t, const vec<T, N>& x) const + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<T, N>& x) const { counter += N; process(x); } - CMT_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); } + KFR_MEM_INTRINSIC T get() + { + return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); + } protected: void reset() { counter = 0; } - CMT_INLINE void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); } + KFR_MEM_INTRINSIC void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); } template <size_t N, KFR_ENABLE_IF(N < width)> - CMT_INLINE void process(const vec<T, N>& x) const + KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const { value = combine(value, reducefn(transformfn(x), narrow<N>(value))); } template <size_t N, KFR_ENABLE_IF(N > width)> - CMT_INLINE void process(const vec<T, N>& x) const + KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const { 
process(low(x)); process(high(x)); @@ -109,10 +114,11 @@ protected: }; } // namespace internal -template <typename ReduceFn, typename TransformFn = fn::pass_through, typename FinalFn = fn::pass_through, - typename E1, typename T = value_type_of<E1>> -KFR_SINTRIN T reduce(const E1& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(), - FinalFn&& finalfn = fn::pass_through()) +template <typename ReduceFn, typename TransformFn = fn_generic::pass_through, + typename FinalFn = fn_generic::pass_through, typename E1, typename T = value_type_of<E1>> +KFR_INTRINSIC T reduce(const E1& e1, ReduceFn&& reducefn, + TransformFn&& transformfn = fn_generic::pass_through(), + FinalFn&& finalfn = fn_generic::pass_through()) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); using reducer_t = internal::expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>; @@ -134,7 +140,7 @@ KFR_FN(reduce) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sum(const E1& x) +KFR_INTRINSIC T sum(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add()); @@ -149,10 +155,10 @@ KFR_SINTRIN T sum(const E1& x) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T mean(const E1& x) +KFR_INTRINSIC T mean(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(x, fn::add(), fn::pass_through(), fn::final_mean()); + return reduce(x, fn::add(), fn_generic::pass_through(), fn::final_mean()); } /** @@ -161,7 +167,7 @@ KFR_SINTRIN T mean(const E1& x) * x must have its size and type specified. 
*/ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T minof(const E1& x) +KFR_INTRINSIC T minof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::min()); @@ -173,7 +179,7 @@ KFR_SINTRIN T minof(const E1& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T maxof(const E1& x) +KFR_INTRINSIC T maxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::max()); @@ -185,7 +191,7 @@ KFR_SINTRIN T maxof(const E1& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absminof(const E1& x) +KFR_INTRINSIC T absminof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::absmin()); @@ -197,7 +203,7 @@ KFR_SINTRIN T absminof(const E1& x) * x must have its size and type specified. 
*/ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absmaxof(const E1& x) +KFR_INTRINSIC T absmaxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::absmax()); @@ -214,7 +220,7 @@ KFR_SINTRIN T absmaxof(const E1& x) template <typename E1, typename E2, typename T = value_type_of<decltype(std::declval<E1>() * std::declval<E2>())>, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_SINTRIN T dotproduct(E1&& x, E2&& y) +KFR_INTRINSIC T dotproduct(E1&& x, E2&& y) { auto m = std::forward<E1>(x) * std::forward<E2>(y); using E12 = decltype(m); @@ -231,7 +237,7 @@ KFR_SINTRIN T dotproduct(E1&& x, E2&& y) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T rms(const E1& x) +KFR_INTRINSIC T rms(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add(), fn::sqr(), fn::final_rootmean()); @@ -246,7 +252,7 @@ KFR_SINTRIN T rms(const E1& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sumsqr(const E1& x) +KFR_INTRINSIC T sumsqr(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add(), fn::sqr()); @@ -261,9 +267,11 @@ KFR_SINTRIN T sumsqr(const E1& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T product(const E1& x) +KFR_INTRINSIC T product(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::mul()); } + +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp @@ -1,158 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D 
Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/round.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 floor(const T1& x) -{ - return intrinsics::floor(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::floor, E1> floor(E1&& x) -{ - return { fn::floor(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 ceil(const T1& x) -{ - return intrinsics::ceil(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::ceil, E1> ceil(E1&& x) -{ - return { fn::ceil(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 round(const T1& x) -{ - return intrinsics::round(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::round, E1> round(E1&& x) -{ - return { fn::round(), std::forward<E1>(x) }; -} - -template <typename T1, 
KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 trunc(const T1& x) -{ - return intrinsics::trunc(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::trunc, E1> trunc(E1&& x) -{ - return { fn::trunc(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 fract(const T1& x) -{ - return intrinsics::fract(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::fract, E1> fract(E1&& x) -{ - return { fn::fract(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> ifloor(const T1& x) -{ - return intrinsics::ifloor(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::ifloor, E1> ifloor(E1&& x) -{ - return { fn::ifloor(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> iceil(const T1& x) -{ - return intrinsics::iceil(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::iceil, E1> iceil(E1&& x) -{ - return { fn::iceil(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> iround(const T1& x) -{ - return intrinsics::iround(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::iround, E1> iround(E1&& x) -{ - return { fn::iround(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> itrunc(const T1& x) -{ - return intrinsics::itrunc(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::itrunc, E1> itrunc(E1&& x) -{ - return { fn::itrunc(), 
std::forward<E1>(x) }; -} - -template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)> -CMT_INLINE T fmod(const T& x, const T& y) -{ - return x - trunc(x / y) * y; -} -KFR_FN(fmod) - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -constexpr CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return x % y; -} -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return fmod(x, y); -} -} // namespace kfr diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp @@ -1,62 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/saturation.hpp" - -namespace kfr -{ - -/// @brief Adds two arguments using saturation -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout satadd(const T1& x, const T2& y) -{ - return intrinsics::satadd(x, y); -} - -/// @brief Creates an expression that adds two arguments using saturation -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y) -{ - return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Subtracts two arguments using saturation -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout satsub(const T1& x, const T2& y) -{ - return intrinsics::satsub(x, y); -} - -/// @brief Creates an expression that subtracts two arguments using saturation -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y) -{ - return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -1,57 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/select.hpp" - -namespace kfr -{ - -/** - * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator. - * @code - * return m ? x : y - * @endcode - */ -template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), - typename Tout = subtype<common_type<T2, T3>>> -KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) -{ - static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); - return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), static_cast<vec<Tout, N>>(x), - static_cast<vec<Tout, N>>(y)); -} - -/** - * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments - * is same as in ternary operator. - */ -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_INTRIN internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y) -{ - return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp @@ -1,625 +0,0 @@ -/** @addtogroup shuffle - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "constants.hpp" -#include "expression.hpp" -#include "types.hpp" -#include "vec.hpp" - -#include <utility> - -namespace kfr -{ - -namespace internal -{ - -template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts), - size_t Nout = sizeof...(indices)> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values) -{ - const std::tuple<Ts...> tup(values...); - return vec<T, Nout>(std::get<indices % Nin>(tup)...); -} -} // namespace internal - -template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type> -CMT_GNU_CONSTEXPR CMT_INLINE vec<C, Nout> broadcast(const Ts&... 
values) -{ - return internal::broadcast_helper<C>(csizeseq_t<Nout>(), values...); -} -KFR_FN(broadcast) - -template <size_t Ncount, typename T, size_t N> -CMT_INLINE vec<T, N + Ncount> padhigh(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N + Ncount>()); -} -KFR_FN(padhigh) - -template <size_t Ncount, typename T, size_t N> -CMT_INLINE vec<T, N + Ncount> padlow(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N + Ncount, 0 - Ncount>()); -} -KFR_FN(padlow) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)> -CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>()); -} -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)> -constexpr CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) -{ - return x; -} -KFR_FN(extend) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> slice(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<count, start>()); -} -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<count, start>()); -} -KFR_FN(slice) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle( - y, csizeseq_t<N>() + - (csizeseq_t<N>() >= csize_t<start>() && csizeseq_t<N>() < csize_t<start + count>()) * N); -} -KFR_FN(replace) - -template <size_t, typename T, size_t N> -CMT_INLINE void split(const vec<T, N>&) -{ -} -template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args> -CMT_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... 
args) -{ - out = x.shuffle(csizeseq_t<Nout, start>()); - split<start + Nout>(x, std::forward<Args>(args)...); -} -template <typename T, size_t N> -CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high) -{ - low = x.shuffle(csizeseq_t<N / 2, 0>()); - high = x.shuffle(csizeseq_t<N / 2, N / 2>()); -} -template <typename T, size_t N> -CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2, - vec<T, N / 4>& w3) -{ - w0 = x.shuffle(csizeseq_t<N / 4, 0>()); - w1 = x.shuffle(csizeseq_t<N / 4, N / 4>()); - w2 = x.shuffle(csizeseq_t<N / 4, 2 * N / 4>()); - w3 = x.shuffle(csizeseq_t<N / 4, 3 * N / 4>()); -} -KFR_FN(split) - -template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total> -CMT_INLINE vec<T, Nout> part(const vec<T, N>& x) -{ - static_assert(N % total == 0, "N % total == 0"); - return x.shuffle(csizeseq_t<Nout, number * Nout>()); -} -KFR_FN(part) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<count, start>()); -} - -template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<count, start>()); -} - -template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<count, N2 - N1 + start>()); -} - -KFR_FN(concat_and_slice) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)> -CMT_INLINE vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) -{ - static_assert(Nout > N, "Nout > 
N"); - return concat(x, broadcast<Nout - N>(newvalue)); -} -template <size_t Nout, typename T, typename TS> -constexpr CMT_INLINE vec<T, Nout> widen(const vec<T, Nout>& x, TS) -{ - return x; -} -KFR_FN(widen) - -template <size_t Nout, typename T, size_t N> -CMT_INLINE vec<T, Nout> narrow(const vec<T, N>& x) -{ - static_assert(Nout <= N, "Nout <= N"); - return slice<0, Nout>(x); -} -KFR_FN(narrow) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -CMT_INLINE vec<T, Nout> even(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<Nout / group, 0, 2>())); -} -KFR_FN(even) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -CMT_INLINE vec<T, Nout> odd(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<Nout / group, 1, 2>())); -} -KFR_FN(odd) - -namespace internal -{ -template <size_t groupsize = 2> -struct shuffle_index_dup1 -{ - constexpr inline size_t operator()(size_t index) const { return index / groupsize; } -}; - -template <size_t groupsize = 2, size_t start = 0> -struct shuffle_index_dup -{ - constexpr inline size_t operator()(size_t index) const { return start + index / groupsize * groupsize; } -}; -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> dupeven(const vec<T, N>& x) -{ - static_assert(N % 2 == 0, "N must be even"); - return x.shuffle(csizeseq_t<N, 0, 1>() & ~csize_t<1>()); -} -KFR_FN(dupeven) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> dupodd(const vec<T, N>& x) -{ - static_assert(N % 2 == 0, "N must be even"); - return x.shuffle(csizeseq_t<N, 0, 1>() | csize_t<1>()); -} -KFR_FN(dupodd) - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> duphalfs(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N * 2>() % csize_t<N>()); -} -KFR_FN(duphalfs) - -namespace internal -{ -template <size_t size, size_t... 
Indices> -struct shuffle_index_shuffle -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); -} -KFR_FN(shuffle) - -template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); -} -KFR_FN(shufflegroups) - -namespace internal -{ -template <size_t size, size_t... Indices> -struct shuffle_index_permute -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] + - csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); -} -KFR_FN(permute) - -template <size_t group, size_t... 
Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); -} -KFR_FN(permutegroups) - -namespace internal -{ - -template <typename T, size_t Nout, typename Fn, size_t... Indices> -constexpr CMT_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>) -{ - return make_vector(static_cast<T>(Fn()(Indices))...); -} -} // namespace internal - -template <typename T, size_t Nout, typename Fn> -constexpr CMT_INLINE vec<T, Nout> generate_vector() -{ - return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>()); -} -KFR_FN(generate_vector) - -namespace internal -{ -template <typename T, size_t N> -constexpr CMT_INLINE mask<T, N> evenmask() -{ - return broadcast<N, T>(maskbits<T>(true), maskbits<T>(false)); -} -template <typename T, size_t N> -constexpr CMT_INLINE mask<T, N> oddmask() -{ - return broadcast<N, T>(maskbits<T>(false), maskbits<T>(true)); -} -} // namespace internal - -template <typename T, size_t N, size_t Nout = N * 2> -CMT_INLINE vec<T, Nout> dup(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() / csize_t<2>()); -} -KFR_FN(dup) - -namespace internal -{ -template <size_t count, size_t start = 0> -struct shuffle_index_duphalf -{ - constexpr inline size_t operator()(size_t index) const { return start + (index) % count; } -}; -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> duplow(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>()); -} -KFR_FN(duplow) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> duphigh(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>()); -} -KFR_FN(duphigh) - -namespace internal -{ -template <size_t size, size_t... 
Indices> -struct shuffle_index_blend -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return (elements_t<Indices...>::get(csize_t<index % indexcount>()) ? size : 0) + index % size; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N> -CMT_INLINE vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>()); -} -KFR_FN(blend) - -namespace internal -{ -template <size_t elements> -struct shuffle_index_swap -{ - constexpr inline size_t operator()(size_t index) const - { - static_assert(is_poweroftwo(elements), "is_poweroftwo( elements )"); - return index ^ (elements - 1); - } -}; -template <size_t amount, size_t N> -struct shuffle_index_outputright -{ - constexpr inline size_t operator()(size_t index) const - { - return index < N - amount ? index : index + amount; - } -}; -} // namespace internal - -template <size_t elements = 2, typename T, size_t N> -CMT_INLINE vec<T, N> swap(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>()); -} -CMT_FN_TPL((size_t elements), (elements), swap) - -template <size_t shift, typename T, size_t N> -CMT_INLINE vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) -{ - return shift == 0 ? lo : (shift == N ? 
hi : hi.shuffle(lo, csizeseq_t<N, N - shift>())); -} - -template <size_t amount, typename T, size_t N> -CMT_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) -{ - static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); - return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>()); -} -KFR_FN(rotateright) - -template <size_t amount, typename T, size_t N> -CMT_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) -{ - static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); - return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>()); -} -KFR_FN(rotateleft) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> insertright(T x, const vec<T, N>& y) -{ - return concat_and_slice<1, N>(y, vec<T, 1>(x)); -} -KFR_FN(insertright) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> insertleft(T x, const vec<T, N>& y) -{ - return concat_and_slice<0, N>(vec<T, 1>(x), y); -} -KFR_FN(insertleft) - -// template <typename T, size_t N, size_t N2> -// CMT_INLINE vec<T, N> outputright(const vec<T, N>& x, const vec<T, N2>& y) -//{ -// return shufflevector<N, internal::shuffle_index_outputright<N2, N>>(x, extend<N>(y)); -//} -// KFR_FN(outputright) - -namespace internal -{ -template <size_t size, size_t side1> -struct shuffle_index_transpose -{ - constexpr inline size_t operator()(size_t index) const - { - return index % (size / side1) * side1 + index / (size / side1); - } -}; -} // namespace internal - -template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group, - size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)> -CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)> -CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) -{ - 
return x; -} -template <typename T, size_t N> -CMT_INLINE vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x) -{ - return vec<vec<T, N>, N>(transpose<2>(x.flatten())); -} -KFR_FN(transpose) - -template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group, - size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)> -CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> -CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) -{ - return x; -} -KFR_FN(transposeinverse) - -template <size_t side, typename T, size_t N> -CMT_INLINE vec<T, N> ctranspose(const vec<T, N>& x) -{ - return transpose<side, 2>(x); -} -KFR_FN(ctranspose) - -template <size_t side, typename T, size_t N> -CMT_INLINE vec<T, N> ctransposeinverse(const vec<T, N>& x) -{ - return transposeinverse<side, 2>(x); -} -KFR_FN(ctransposeinverse) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group, - size_t side2 = 2, size_t side1 = size / side2> -CMT_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(interleave) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y) -{ - return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2, - size_t side1 = size / side2> -CMT_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * 
csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(interleavehalfs) - -template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2, - size_t side2 = size / side1> -CMT_INLINE vec<T, N> splitpairs(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(splitpairs) - -namespace internal -{ -template <size_t size> -struct shuffle_index_reverse -{ - constexpr inline size_t operator()(size_t index) const { return size - 1 - index; } -}; -} // namespace internal - -template <size_t group = 1, typename T, size_t N, size_t size = N / group> -CMT_INLINE vec<T, N> reverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>())); -} -template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x) -{ - return vec<vec<T, N1>, N2>(swap<N1>(x.flatten())); -} -KFR_FN(reverse) - -namespace internal -{ -template <size_t N1, size_t N2> -struct shuffle_index_combine -{ - constexpr inline size_t operator()(size_t index) const { return index >= N2 ? 
index : N1 + index; } -}; -} // namespace internal - -template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) -{ - static_assert(N2 <= N1, "N2 <= N1"); - return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>()); - // return shufflevector<N1, internal::shuffle_index_combine<N1, N2>>(x, extend<N1>(y)); -} -KFR_FN(combine) - -namespace internal -{ -template <size_t start, size_t stride> -struct generate_index -{ - CMT_INLINE constexpr size_t operator()(size_t index) const { return start + index * stride; } -}; -template <size_t start, size_t size, int on, int off> -struct generate_onoff -{ - CMT_INLINE constexpr size_t operator()(size_t index) const - { - return index >= start && index < start + size ? on : off; - } -}; -} // namespace internal - -template <typename T, size_t N, size_t start = 0, size_t stride = 1> -constexpr CMT_INLINE vec<T, N> enumerate() -{ - return generate_vector<T, N, internal::generate_index<start, stride>>(); -} -template <size_t start = 0, size_t stride = 1, typename T, size_t N> -constexpr CMT_INLINE vec<T, N> enumerate(vec_t<T, N>) -{ - return generate_vector<T, N, internal::generate_index<start, stride>>(); -} -KFR_FN(enumerate) - -template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0> -constexpr CMT_INLINE vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) -{ - return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); -} -template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N> -constexpr CMT_INLINE vec<T, N> onoff(vec_t<T, N>, cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) -{ - return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); -} -KFR_FN(onoff) -} // namespace kfr -#define KFR_SHUFFLE_SPECIALIZATIONS 1 -#include "specializations.i" diff --git a/include/kfr/base/simd_clang.hpp 
b/include/kfr/base/simd_clang.hpp @@ -1,350 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" -#include "platform.hpp" -#include "types.hpp" - -#if CMT_COMPILER_CLANG - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4324)) - -namespace kfr -{ - -template <typename T, size_t... Ns> -constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept; - -#define KFR_NATIVE_INTRINSICS 1 - -namespace internal -{ -template <typename TT, size_t NN> -using simd_type = TT __attribute__((ext_vector_type(NN))); - -template <typename T, size_t N, bool A> -using simd_storage = internal::struct_with_alignment<simd_type<T, N>, A>; - -template <typename T, size_t N, size_t... indices> -CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, const simd_type<T, N>& y, - csizes_t<indices...>) -{ - return __builtin_shufflevector(x, y, ((indices >= N * 2) ? -1 : static_cast<int>(indices))...); -} -template <typename T, size_t N, size_t... 
indices> -CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, csizes_t<indices...>) -{ - return __builtin_shufflevector(x, x, ((indices >= N) ? -1 : static_cast<int>(indices))...); -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -CMT_INLINE simd_type<T, N> simd_read(const T* src) -{ - return ptr_cast<simd_storage<T, N, A>>(src)->value; -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -CMT_INLINE simd_type<T, N> simd_read(const T* src) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - constexpr auto extend_indices = - cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); - constexpr auto concat_indices = cvalseq_t<size_t, N>(); - return simd_shuffle<T, first>(simd_read<first, A>(src), - simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices), - concat_indices); -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value) -{ - ptr_cast<simd_storage<T, N, A>>(dest)->value = value; -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - simd_write<A, first>(dest, simd_shuffle(value, csizeseq_t<first>())); - simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq_t<rest, first>())); -} -} // namespace internal - -template <typename T, size_t N> -struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N> -{ - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - // type and size - using value_type = T; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr 
static size_t scalar_size() noexcept { return N; } - - using mask_t = mask<T, N>; - - using simd_type = internal::simd_type<T, N>; - using uvalue_type = utype<T>; - using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; - using usimd_type = internal::simd_type<uvalue_type, N>; - using iusimd_type = internal::simd_type<iuvalue_type, N>; - - // constructors and assignment - // default - constexpr vec() noexcept = default; - // copy - constexpr vec(const vec&) noexcept = default; - // assignment - constexpr vec& operator=(const vec&) noexcept = default; - // from scalar - template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>> - constexpr vec(const U& s) noexcept : simd(s) - { - } - // from list - template <typename... Us> - constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : simd{ s0, s1, static_cast<value_type>(rest)... } - { - } - // from vector of another type - template <typename U, typename = enable_if<is_simd_type<U>::value>> - constexpr vec(const vec<U, N>& v) noexcept : simd(__builtin_convertvector(v.simd, simd_type)) - { - } - constexpr vec(const simd_type& simd) noexcept : simd(simd) {} - // from list of vectors - template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>> - constexpr vec(const vec<T, Ns>&... 
vs) noexcept : simd(*concat(vs...)) - { - } - constexpr vec(czeros_t) noexcept : simd(0) {} - constexpr vec(cones_t) noexcept : simd(*(vec() == vec())) {} - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - constexpr static vec frombits(const vec<U, M>& v) noexcept - { - return (simd_type)(v.flatten().simd); - } - -#define KFR_U(x) ((usimd_type)(x)) -#define KFR_IU(x) ((iusimd_type)(x)) -#define KFR_S(x) ((simd_type)(x)) - - // math / bitwise / comparison operators - constexpr friend vec operator+(const vec& x) noexcept { return x; } - constexpr friend vec operator-(const vec& x) noexcept { return KFR_S(-*x); } - constexpr friend vec operator~(const vec& x) noexcept { return KFR_S(~KFR_U(*x)); } - - constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return *x + *y; } - constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return *x - *y; } - constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return *x * *y; } - constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return *x / *y; } - - constexpr friend vec operator<<(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) << shift); } - constexpr friend vec operator>>(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) >> shift); } - constexpr friend vec operator&(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) & KFR_U(*y)); - } - constexpr friend vec operator|(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) | KFR_U(*y)); - } - constexpr friend vec operator^(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) ^ KFR_U(*y)); - } - - constexpr friend mask_t operator==(const vec& x, const vec& y) noexcept { return KFR_S(*x == *y); } - constexpr friend mask_t operator!=(const vec& x, const vec& y) noexcept { return KFR_S(*x != *y); } - constexpr friend mask_t operator<(const vec& x, const vec& y) noexcept { return KFR_S(*x < *y); } - constexpr friend mask_t 
operator>(const vec& x, const vec& y) noexcept { return KFR_S(*x > *y); } - constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); } - constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); } - - constexpr mask_t asmask() const noexcept { return mask_t(simd); } - -#undef KFR_S -#undef KFR_U - - constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; } - - constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; } - constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; } - constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - constexpr friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - constexpr friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return __builtin_shufflevector(simd, simd, (indices >= N ? -1 : int(indices))...); - } - template <size_t... indices> - vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return __builtin_shufflevector(simd, y.simd, (indices >= N * 2 ? 
-1 : int(indices))...); - } - - // element access - struct element; - constexpr value_type operator[](size_t index) const& noexcept { return get(index); } - constexpr value_type operator[](size_t index) && noexcept { return get(index); } - constexpr element operator[](size_t index) & noexcept { return { *this, index }; } - - constexpr value_type get(size_t index) const noexcept { return simd[index]; } - constexpr void set(size_t index, const value_type& s) noexcept { simd[index] = s; } - template <size_t index> - constexpr value_type get(csize_t<index>) const noexcept - { - return simd[index]; - } - template <size_t index> - constexpr void set(csize_t<index>, const value_type& s) noexcept - { - simd[index] = s; - } - struct element - { - constexpr operator value_type() const noexcept { return v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - element& operator=(const element& s) noexcept - { - v.set(index, static_cast<value_type>(s)); - return *this; - } - template <typename U, size_t M> - element& operator=(const typename vec<U, M>::element& s) noexcept - { - v.set(index, static_cast<value_type>(static_cast<U>(s))); - return *this; - } - vec& v; - size_t index; - }; - - // read/write - template <bool aligned = false> - explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : simd(internal::simd_read<N, aligned>(src)) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - internal::simd_write<aligned, N>(dest, simd); - return *this; - } - - // native SIMD type access - const vec& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return simd; } - simd_type& operator*() noexcept { return simd; } - -protected: - template <typename U, size_t M> - friend struct vec; - - simd_type simd; - -private: -}; - -namespace internal -{ -template <typename T, 
size_t N> -CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x) -{ - return x; -} - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<N * 2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<N1 + N2, N2 - N1>()); -} - -template <typename T, size_t N1, size_t N2, size_t... Sizes> -CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y, - const vec<T, Sizes>&... args) -{ - return concat_impl(concat_impl(x, y), args...); -} -} // namespace internal - -template <typename T, size_t... Ns> -constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept -{ - return internal::concat_impl(vs...); -} -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) - -#endif diff --git a/include/kfr/base/simd_intrin.hpp b/include/kfr/base/simd_intrin.hpp @@ -1,392 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" - -#include "constants.hpp" -#include "platform.hpp" -#include "types.hpp" - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4324)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -#ifdef CMT_INTRINSICS_IS_CONSTEXPR -#define KFR_I_CE constexpr -#else -#define KFR_I_CE -#endif - -namespace kfr -{ - -template <typename T, size_t... Ns> -constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept; - -#define KFR_NATIVE_INTRINSICS 1 - -template <typename T, size_t N> -struct simd_type_holder -{ - using type = struct - { - T v[N]; - }; -}; - -#define KFR_SIMD_SPEC_TYPE(T, N, MM) \ - template <> \ - struct simd_type_holder<T, N> \ - { \ - using type = MM; \ - }; - -#ifdef CMT_ARCH_SSE2 -KFR_SIMD_SPEC_TYPE(u8, 16, __m128i); -KFR_SIMD_SPEC_TYPE(u16, 8, __m128i); -KFR_SIMD_SPEC_TYPE(u32, 4, __m128i); -KFR_SIMD_SPEC_TYPE(u64, 2, __m128i); -KFR_SIMD_SPEC_TYPE(i8, 16, __m128i); -KFR_SIMD_SPEC_TYPE(i16, 8, __m128i); -KFR_SIMD_SPEC_TYPE(i32, 4, __m128i); -KFR_SIMD_SPEC_TYPE(i64, 2, __m128i); -KFR_SIMD_SPEC_TYPE(f32, 4, __m128); -KFR_SIMD_SPEC_TYPE(f64, 2, __m128d); -#endif - -#ifdef CMT_ARCH_AVX -KFR_SIMD_SPEC_TYPE(u8, 32, __m256i); -KFR_SIMD_SPEC_TYPE(u16, 16, __m256i); -KFR_SIMD_SPEC_TYPE(u32, 8, __m256i); -KFR_SIMD_SPEC_TYPE(u64, 4, __m256i); -KFR_SIMD_SPEC_TYPE(i8, 32, __m256i); -KFR_SIMD_SPEC_TYPE(i16, 16, __m256i); -KFR_SIMD_SPEC_TYPE(i32, 8, __m256i); -KFR_SIMD_SPEC_TYPE(i64, 4, __m256i); -KFR_SIMD_SPEC_TYPE(f32, 8, __m256); -KFR_SIMD_SPEC_TYPE(f64, 4, __m256d); -#endif - 
-#ifdef CMT_ARCH_AVX512 -KFR_SIMD_SPEC_TYPE(u8, 64, __m512i); -KFR_SIMD_SPEC_TYPE(u16, 32, __m512i); -KFR_SIMD_SPEC_TYPE(u32, 16, __m512i); -KFR_SIMD_SPEC_TYPE(u64, 8, __m512i); -KFR_SIMD_SPEC_TYPE(i8, 64, __m512i); -KFR_SIMD_SPEC_TYPE(i16, 32, __m512i); -KFR_SIMD_SPEC_TYPE(i32, 16, __m512i); -KFR_SIMD_SPEC_TYPE(i64, 8, __m512i); -KFR_SIMD_SPEC_TYPE(f32, 16, __m512); -KFR_SIMD_SPEC_TYPE(f64, 8, __m512d); -#endif - -#ifdef CMT_ARCH_NEON -KFR_SIMD_SPEC_TYPE(u8, 16, uint8x16_t); -KFR_SIMD_SPEC_TYPE(u16, 8, uint16x8_t); -KFR_SIMD_SPEC_TYPE(u32, 4, uint32x4_t); -KFR_SIMD_SPEC_TYPE(u64, 2, uint64x2_t); -KFR_SIMD_SPEC_TYPE(i8, 16, int8x16_t); -KFR_SIMD_SPEC_TYPE(i16, 8, int16x8_t); -KFR_SIMD_SPEC_TYPE(i32, 4, int32x4_t); -KFR_SIMD_SPEC_TYPE(i64, 2, int64x2_t); -KFR_SIMD_SPEC_TYPE(f32, 4, float32x4_t); -#ifdef CMT_ARCH_NEON64 -KFR_SIMD_SPEC_TYPE(f64, 2, float64x2_t); -#endif -#endif - -template <size_t N> -struct raw_bytes -{ - u8 raw[N]; -}; - -#define KFR_CYCLE(...) \ - for (size_t i = 0; i < N; i++) \ - __VA_ARGS__ - -#define KFR_C_CYCLE(...) \ - for (size_t i = 0; i < N; i++) \ - vs[i] = __VA_ARGS__ - -#define KFR_R_CYCLE(...) \ - vec<T, N> result; \ - for (size_t i = 0; i < N; i++) \ - result.vs[i] = __VA_ARGS__; \ - return result - -#define KFR_B_CYCLE(...) \ - vec<T, N> result; \ - for (size_t i = 0; i < N; i++) \ - result.vs[i] = (__VA_ARGS__) ? 
constants<value_type>::allones() : value_type(0); \ - return result.asmask() - -template <typename T, size_t N> -struct alignas(const_min(platform<>::maximum_vector_alignment, sizeof(T) * next_poweroftwo(N))) vec - : vec_t<T, N> -{ - constexpr static size_t simd_width = platform<T>::vector_width; - constexpr static size_t count = (N + simd_width - 1) / simd_width; - - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - // type and size - using value_type = T; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return N; } - - using simd_type = typename simd_type_holder<T, N>::type; - - using uvalue_type = utype<T>; - using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; - - using mask_t = mask<T, N>; - - using uvec = vec<uvalue_type, N>; - using iuvec = vec<iuvalue_type, N>; - - // constructors and assignment - // default - constexpr vec() noexcept = default; - // copy - vec(const vec&) noexcept = default; - // assignment - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return vec<value_type, sizeof...(indices)>((indices < N ? vs[indices % N] : 0)...); - } - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return vec<value_type, sizeof...(indices)>( - (indices < N ? vs[indices % N] : indices < 2 * N ? 
y.vs[(indices - N) % N] : 0)...); - } - - template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>> - KFR_I_CE vec(const U& s) noexcept - { - KFR_C_CYCLE(s); - } - - constexpr vec(const simd_type& simd) noexcept : simd(simd) {} - // from vector of another type - template <typename U, typename = enable_if<is_simd_type<U>::value>> - KFR_I_CE vec(const vec<U, N>& v) noexcept - { - KFR_C_CYCLE(static_cast<value_type>(v.vs[i])); - } - // from list - template <typename... Us> - KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : vs{ s0, s1, static_cast<value_type>(rest)... } - { - } - template <size_t N1, size_t... Ns, typename = enable_if<(csum<size_t, N1, Ns...>() == N)>> - KFR_I_CE vec(const vec<T, N1>& v0, const vec<T, Ns>&... vecs) noexcept : simd(*concat(v0, vecs...)) - { - } - - KFR_I_CE vec(czeros_t) noexcept { KFR_C_CYCLE(value_type(0)); } - KFR_I_CE vec(cones_t) noexcept { KFR_C_CYCLE(constants<value_type>::allones()); } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept - { - vec r; - r.bytes = v.flatten().bytes; - return r; - } - - KFR_I_CE vec operator+() const noexcept { return *this; } - KFR_I_CE vec operator-() const noexcept { KFR_R_CYCLE(-this->vs[i]); } - KFR_I_CE vec operator~() const noexcept - { - uvec xx = uvec::frombits(*this); - KFR_CYCLE(xx.vs[i] = ~xx.vs[i]); - return frombits(xx); - } - - KFR_I_CE vec operator+(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] + y.vs[i]); } - KFR_I_CE vec operator-(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] - y.vs[i]); } - KFR_I_CE vec operator*(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] * y.vs[i]); } - KFR_I_CE vec operator/(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] / y.vs[i]); } - - KFR_I_CE vec operator<<(int shift) const noexcept - { - iuvec xx = iuvec::frombits(*this); - KFR_CYCLE(xx.vs[i] <<= shift); - 
return frombits(xx); - } - KFR_I_CE vec operator>>(int shift) const noexcept - { - iuvec xx = iuvec::frombits(*this); - KFR_CYCLE(xx.vs[i] >>= shift); - return frombits(xx); - } - KFR_I_CE vec operator&(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] &= yy.vs[i]); - return frombits(xx); - } - KFR_I_CE vec operator|(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] |= yy.vs[i]); - return frombits(xx); - } - KFR_I_CE vec operator^(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] ^= yy.vs[i]); - return frombits(xx); - } - - KFR_I_CE mask_t operator==(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] == y.vs[i]); } - KFR_I_CE mask_t operator!=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] != y.vs[i]); } - KFR_I_CE mask_t operator<(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] < y.vs[i]); } - KFR_I_CE mask_t operator>(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] > y.vs[i]); } - KFR_I_CE mask_t operator<=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] <= y.vs[i]); } - KFR_I_CE mask_t operator>=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] >= y.vs[i]); } - - constexpr mask_t asmask() const noexcept { return mask_t(simd); } - - KFR_I_CE vec& operator+=(const vec& y) noexcept { return *this = *this + y; } - KFR_I_CE vec& operator-=(const vec& y) noexcept { return *this = *this - y; } - KFR_I_CE vec& operator*=(const vec& y) noexcept { return *this = *this * y; } - KFR_I_CE vec& operator/=(const vec& y) noexcept { return *this = *this / y; } - KFR_I_CE vec& operator<<=(int shift) noexcept { return *this = *this << shift; } - KFR_I_CE vec& operator>>=(int shift) noexcept { return *this = *this >> shift; } - KFR_I_CE vec& operator&=(const vec& y) noexcept { return *this = *this & y; } - KFR_I_CE vec& operator|=(const 
vec& y) noexcept { return *this = *this | y; } - KFR_I_CE vec& operator^=(const vec& y) noexcept { return *this = *this ^ y; } - - KFR_I_CE vec& operator++() noexcept { return *this = *this + vec(1); } - KFR_I_CE vec& operator--() noexcept { return *this = *this - vec(1); } - KFR_I_CE vec operator++(int) noexcept - { - const vec z = *this; - ++*this; - return z; - } - KFR_I_CE vec operator--(int) noexcept - { - const vec z = *this; - --*this; - return z; - } - - explicit KFR_I_CE vec(const value_type* src) { KFR_C_CYCLE(src[i]); } - explicit KFR_I_CE vec(const value_type* src, cunaligned_t) { KFR_C_CYCLE(src[i]); } - explicit KFR_I_CE vec(const value_type* src, caligned_t) { KFR_C_CYCLE(src[i]); } - - const vec& write(value_type* dest) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - const vec& write(value_type* dest, cunaligned_t) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - const vec& write(value_type* dest, caligned_t) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - - KFR_I_CE value_type operator[](size_t index) const noexcept { return vs[index]; } - KFR_I_CE value_type& operator[](size_t index) noexcept { return vs[index]; } - - const vec& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return simd; } - simd_type& operator*() noexcept { return simd; } - -protected: - template <typename, size_t> - friend struct vec; - - union { - T vs[N]; - simd_type simd; - raw_bytes<N * sizeof(T)> bytes; - }; -}; - -namespace internal -{ -template <typename T, size_t N> -CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x) -{ - return x; -} - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<N * 2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return 
x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<N1 + N2, N2 - N1>()); -} - -template <typename T, size_t N1, size_t N2, size_t... Sizes> -CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y, - const vec<T, Sizes>&... args) -{ - return concat_impl(concat_impl(x, y), args...); -} -} // namespace internal - -template <typename T, size_t... Ns> -constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept -{ - return internal::concat_impl(vs...); -} -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/simd_x86.hpp b/include/kfr/base/simd_x86.hpp @@ -1,272 +0,0 @@ -#pragma once - -#include "constants.hpp" -#include "platform.hpp" -#include "simd_intrin.hpp" -namespace kfr -{ -#ifdef CMT_ARCH_SSE2 - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator+(const vec<f32, 4>& y) const noexcept -{ - return _mm_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator-(const vec<f32, 4>& y) const noexcept -{ - return _mm_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator*(const vec<f32, 4>& y) const noexcept -{ - return _mm_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator/(const vec<f32, 4>& y) const noexcept -{ - return _mm_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator&(const vec<f32, 4>& y) const noexcept -{ - return _mm_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator|(const vec<f32, 4>& y) const noexcept -{ - return 
_mm_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator^(const vec<f32, 4>& y) const noexcept -{ - return _mm_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator+(const vec<f64, 2>& y) const noexcept -{ - return _mm_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator-(const vec<f64, 2>& y) const noexcept -{ - return _mm_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator*(const vec<f64, 2>& y) const noexcept -{ - return _mm_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator/(const vec<f64, 2>& y) const noexcept -{ - return _mm_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator&(const vec<f64, 2>& y) const noexcept -{ - return _mm_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator|(const vec<f64, 2>& y) const noexcept -{ - return _mm_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator^(const vec<f64, 2>& y) const noexcept -{ - return _mm_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_SSE2 - -#ifdef CMT_ARCH_AVX - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator+(const vec<f32, 8>& y) const noexcept -{ - return _mm256_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator-(const vec<f32, 8>& y) const noexcept -{ - return _mm256_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator*(const vec<f32, 8>& y) const noexcept -{ - return _mm256_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator/(const vec<f32, 8>& y) const noexcept -{ - return _mm256_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator&(const vec<f32, 8>& y) const noexcept -{ - return 
_mm256_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator|(const vec<f32, 8>& y) const noexcept -{ - return _mm256_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator^(const vec<f32, 8>& y) const noexcept -{ - return _mm256_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator+(const vec<f64, 4>& y) const noexcept -{ - return _mm256_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator-(const vec<f64, 4>& y) const noexcept -{ - return _mm256_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator*(const vec<f64, 4>& y) const noexcept -{ - return _mm256_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator/(const vec<f64, 4>& y) const noexcept -{ - return _mm256_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator&(const vec<f64, 4>& y) const noexcept -{ - return _mm256_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator|(const vec<f64, 4>& y) const noexcept -{ - return _mm256_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator^(const vec<f64, 4>& y) const noexcept -{ - return _mm256_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_AVX - -#ifdef CMT_ARCH_AVX512 - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator+(const vec<f32, 16>& y) const noexcept -{ - return _mm512_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator-(const vec<f32, 16>& y) const noexcept -{ - return _mm512_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator*(const vec<f32, 16>& y) const noexcept -{ - return _mm512_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator/(const 
vec<f32, 16>& y) const noexcept -{ - return _mm512_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator&(const vec<f32, 16>& y) const noexcept -{ - return _mm512_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator|(const vec<f32, 16>& y) const noexcept -{ - return _mm512_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator^(const vec<f32, 16>& y) const noexcept -{ - return _mm512_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator+(const vec<f64, 8>& y) const noexcept -{ - return _mm512_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator-(const vec<f64, 8>& y) const noexcept -{ - return _mm512_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator*(const vec<f64, 8>& y) const noexcept -{ - return _mm512_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator/(const vec<f64, 8>& y) const noexcept -{ - return _mm512_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator&(const vec<f64, 8>& y) const noexcept -{ - return _mm512_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator|(const vec<f64, 8>& y) const noexcept -{ - return _mm512_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator^(const vec<f64, 8>& y) const noexcept -{ - return _mm512_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_AVX - -} // namespace kfr diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp @@ -1,315 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - 
the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/sin_cos.hpp" - -namespace kfr -{ - -/** - * @brief Returns the trigonometric sine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sin(const T1& x) -{ - return intrinsics::sin(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sin, E1> sin(E1&& x) -{ - return { fn::sin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cos(const T1& x) -{ - return intrinsics::cos(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cos, E1> cos(E1&& x) -{ - return { fn::cos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric sine of x. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastsin(const T1& x) -{ - return intrinsics::fastsin(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastsin, E1> fastsin(E1&& x) -{ - return { fn::fastsin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric cosine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastcos(const T1& x) -{ - return intrinsics::fastcos(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric cosine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastcos, E1> fastcos(E1&& x) -{ - return { fn::fastcos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must - * be a vector. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sincos(const T1& x) -{ - return intrinsics::sincos(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and - * cosine of the odd elements. x must be a vector. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sincos, E1> sincos(E1&& x) -{ - return { fn::sincos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must - * be a vector. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cossin(const T1& x) -{ - return intrinsics::cossin(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and - * sine of the odd elements. x must be a vector. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cossin, E1> cossin(E1&& x) -{ - return { fn::cossin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sindeg(const T1& x) -{ - return intrinsics::sindeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the x (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sindeg, E1> sindeg(E1&& x) -{ - return { fn::sindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cosdeg(const T1& x) -{ - return intrinsics::cosdeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the x (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x) -{ - return { fn::cosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees). 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastsindeg(const T1& x) -{ - return intrinsics::fastsindeg(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric sine of the x - * (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x) -{ - return { fn::fastsindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastcosdeg(const T1& x) -{ - return intrinsics::fastcosdeg(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric cosine of the x - * (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x) -{ - return { fn::fastcosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must - * be a vector and expressed in degrees. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sincosdeg(const T1& x) -{ - return intrinsics::sincosdeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and - * cosine of the odd elements. x must be expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x) -{ - return { fn::sincosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must - * be a vector and expressed in degrees. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cossindeg(const T1& x) -{ - return intrinsics::cossindeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and - * sine of the odd elements. x must be expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x) -{ - return { fn::cossindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the sinc function of x. - * \f[ - * sinc(x) = \frac{sin(x)}{x} - * \f] - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinc(const T1& x) -{ - return intrinsics::sinc(x); -} - -/** - * @brief Returns template expression that returns the sinc function of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinc, E1> sinc(E1&& x) -{ - return { fn::sinc(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T sin2x(const T& sinx, const T& cosx) -{ - return 2 * sinx * cosx; -} - -/** - * @brief Returns the trigonometric sine of the angle 3x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T sin3x(const T& sinx, const T& cosx) -{ - return sinx * (-1 + 4 * sqr(cosx)); -} - -/** - * @brief Returns the trigonometric cosine of the angle 2x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T cos2x(const T& sinx, const T& cosx) -{ - return sqr(cosx) - sqr(sinx); -} - -/** - * @brief Returns the trigonometric cosine of the angle 3x using sin(x) and cos(x). 
- */ -template <typename T> -KFR_SINTRIN T cos3x(const T& sinx, const T& cosx) -{ - return cosx * (1 - 4 * sqr(sinx)); -} -} // namespace kfr diff --git a/include/kfr/base/small_buffer.hpp b/include/kfr/base/small_buffer.hpp @@ -1,4 +1,4 @@ -/** @addtogroup utility +/** @addtogroup types * @{ */ /* @@ -31,16 +31,15 @@ namespace kfr { - template <typename T, std::size_t Capacity = 16> struct small_buffer { public: - small_buffer() noexcept : m_size(0), m_data(m_preallocated) {} + small_buffer() CMT_NOEXCEPT : m_size(0), m_data(m_preallocated) {} small_buffer(std::size_t size) : small_buffer() { resize(size); } - friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) noexcept + friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) CMT_NOEXCEPT { using std::swap; diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -25,12 +25,15 @@ */ #pragma once -#include "min_max.hpp" -#include "shuffle.hpp" -#include "vec.hpp" +#include "../math/min_max.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Sort the elements in the vector in ascending order * @param x input vector @@ -40,12 +43,12 @@ namespace kfr * @endcode */ template <typename T, size_t N> -CMT_INLINE vec<T, N> sort(const vec<T, N>& x) +KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; @@ -73,12 +76,12 @@ CMT_INLINE vec<T, N> sort(const vec<T, N>& x) * @endcode */ template <typename T, size_t N> -CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) +KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, 
Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; @@ -96,4 +99,5 @@ CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) } return interleavehalfs(concat(e, o)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/specializations.i b/include/kfr/base/specializations.i @@ -1,109 +0,0 @@ -/** - * Copyright (C) 2016 D Levin (http://www.kfrlib.com) - * This file is part of KFR - * - * KFR is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * KFR is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with KFR. 
- */ -#pragma once - -#include "vec.hpp" -#ifndef KFR_SHUFFLE_SPECIALIZATIONS -#include "shuffle.hpp" -#endif - -#ifdef KFR_COMPILER_GNU - -namespace kfr -{ -namespace internal -{ -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, - 15, 22, 23, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - f32x32 w = x; - - w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)), - permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w))); - - w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op - return w; -} - -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22, - 23, 14, 15, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - f32x32 w = x; - - w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)), - permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w))); - - w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op - return w; -} - -inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x) -{ - return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, - 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>, - x, x); -} - -template <> -inline vec<f32, 64> shufflevector<f32, 64>( - csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13, - 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38, - 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>, - const vec<f32, 64>& x, const vec<f32, 64>&) -{ - return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x)))); -} - -template <> 
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) -{ -// asm volatile("int $3"); - const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x); - - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)), shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx))); -} - -template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) -{ - const vec<f32, 16> xx = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); - - return permutegroups<(4), 0, 2, 1, 3>(xx); -} - -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, - 29, 14, 30, 15, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x); - - return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx))); -} -} -} -#endif diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp @@ -1,50 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/sqrt.hpp" - -namespace kfr -{ - -/** - * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$ - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> sqrt(const T1& x) -{ - return intrinsics::sqrt(x); -} - -/** - * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$ - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::sqrt, E1> sqrt(E1&& x) -{ - return { fn::sqrt(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp @@ -1,56 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/tan.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tan(const T1& x) -{ - return intrinsics::tan(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tan, E1> tan(E1&& x) -{ - return { fn::tan(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tandeg(const T1& x) -{ - return intrinsics::tandeg(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x) -{ - return { fn::tandeg(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -1,429 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "kfr.h" - -#include "intrinsics.h" - -#include <cmath> - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") - -#ifdef KFR_TESTING -#include "../testo/testo.hpp" -#endif - -#include "../cometa.hpp" - -#define KFR_ENABLE_IF CMT_ENABLE_IF - -/** - * @brief Internal macro for functions - */ -#define KFR_FN(FN) \ - namespace fn \ - { \ - struct FN \ - { \ - template <typename... Args> \ - CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const \ - { \ - return ::kfr::FN(std::forward<Args>(args)...); \ - } \ - }; \ - } - -/** - * @brief Internal macro for functions - */ -#define KFR_I_FN(FN) \ - namespace fn \ - { \ - struct FN \ - { \ - template <typename... Args> \ - CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ - Args&&... args) const \ - { \ - return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \ - } \ - }; \ - } - -namespace kfr -{ -// Include all from CoMeta library -using namespace cometa; - -/// @brief Short names for common types -using f32 = float; -using f64 = double; -using i8 = int8_t; -using i16 = int16_t; -using i32 = int32_t; -using i64 = int64_t; -using u8 = uint8_t; -using u16 = uint16_t; -using u32 = uint32_t; -using u64 = uint64_t; -using umax = uint64_t; -using imax = int64_t; -using fmax = double; -using f80 = long double; - -#if defined(KFR_BASETYPE_F32) || defined(KFR_NO_NATIVE_F64) -/// @brief Floating point type used by default -using fbase = f32; -#else -/// @brief Floating point type used by default -using fbase = f64; -#endif - -constexpr ctype_t<f32> ctype_f32{}; -constexpr ctype_t<f64> ctype_f64{}; -constexpr ctype_t<i8> ctype_i8{}; -constexpr ctype_t<i16> ctype_i16{}; -constexpr ctype_t<i32> ctype_i32{}; -constexpr ctype_t<i64> ctype_i64{}; -constexpr ctype_t<u8> ctype_u8{}; -constexpr ctype_t<u16> ctype_u16{}; -constexpr ctype_t<u32> ctype_u32{}; -constexpr ctype_t<u64> ctype_u64{}; 
-constexpr ctype_t<umax> ctype_umax{}; -constexpr ctype_t<imax> ctype_imax{}; -constexpr ctype_t<fmax> ctype_fmax{}; -constexpr ctype_t<f80> ctype_f80{}; -constexpr ctype_t<fbase> ctype_base{}; - -struct u24 -{ - u8 raw[3]; -}; - -struct i24 -{ - u8 raw[3]; - - i24(i32 x) - { - raw[0] = x & 0xFF; - raw[1] = (x >> 8) & 0xFF; - raw[2] = (x >> 16) & 0xFF; - } - - i32 as_int() const - { - return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) | - (static_cast<i32>(raw[2] << 24) >> 8); - } - - operator int() const { return as_int(); } -}; - -struct f16 -{ - u16 raw; -}; - -/// @brief An enumeration representing data type -template <typename T1> -struct range -{ - T1 min; - T1 max; - T1 distance() const { return max - min; } -}; - -/// @brief An enumeration representing data type -enum class datatype : int -{ - typebits_mask = 0xFF, - f = 0x100, - i = 0x200, - u = 0x300, - c = 0x400, - typeclass_mask = 0xF00, - x1 = 0x1000, - x2 = 0x2000, - x3 = 0x3000, - x4 = 0x4000, - typecomponents_mask = 0xF000, - f16 = static_cast<int>(f) | static_cast<int>(x1) | 16, - f32 = static_cast<int>(f) | static_cast<int>(x1) | 32, - f64 = static_cast<int>(f) | static_cast<int>(x1) | 64, - f80 = static_cast<int>(f) | static_cast<int>(x1) | 80, - i8 = static_cast<int>(i) | static_cast<int>(x1) | 8, - i16 = static_cast<int>(i) | static_cast<int>(x1) | 16, - i24 = static_cast<int>(i) | static_cast<int>(x1) | 24, - i32 = static_cast<int>(i) | static_cast<int>(x1) | 32, - i64 = static_cast<int>(i) | static_cast<int>(x1) | 64, - u8 = static_cast<int>(u) | static_cast<int>(x1) | 8, - u16 = static_cast<int>(u) | static_cast<int>(x1) | 16, - u24 = static_cast<int>(u) | static_cast<int>(x1) | 24, - u32 = static_cast<int>(u) | static_cast<int>(x1) | 32, - u64 = static_cast<int>(u) | static_cast<int>(x1) | 64, - c32 = static_cast<int>(c) | static_cast<int>(x2) | 32, - c64 = static_cast<int>(c) | static_cast<int>(x2) | 64 -}; - -inline datatype operator|(datatype x, datatype y) -{ - using type = 
underlying_type<datatype>; - return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y)); -} - -inline datatype operator&(datatype x, datatype y) -{ - using type = underlying_type<datatype>; - return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y)); -} - -template <typename T> -constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value - ? datatype::f - : std::is_integral<typename compound_type_traits<T>::subtype>::value - ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value - ? datatype::u - : datatype::i) - : datatype(); - -template <typename T> -using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>; -template <typename T> -using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>; -template <typename T> -using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>; - -template <typename T> -struct typebits -{ - static_assert(is_number<deep_subtype<T>>::value, ""); - constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8; - constexpr static size_t width = compound_type_traits<T>::is_scalar ? 
0 : compound_type_traits<T>::width; - using subtype = typename compound_type_traits<T>::subtype; -}; - -namespace fn -{ -///@copybrief cometa::pass_through -using pass_through = cometa::fn_pass_through; - -///@copybrief cometa::noop -using noop = cometa::fn_noop; - -///@copybrief cometa::get_first -using get_first = cometa::fn_get_first; - -///@copybrief cometa::get_second -using get_second = cometa::fn_get_second; - -///@copybrief cometa::get_third -using get_third = cometa::fn_get_third; - -///@copybrief cometa::returns -template <typename T> -using returns = cometa::fn_returns<T>; -} // namespace fn - -template <typename T> -using ftype = - typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>; -template <typename T> -using itype = - typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>; -template <typename T> -using utype = - typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>; - -template <typename T> -using fsubtype = ftype<subtype<T>>; -template <typename T> -using isubtype = itype<subtype<T>>; -template <typename T> -using usubtype = utype<subtype<T>>; - -namespace internal -{ -template <typename T> -struct flt_type_impl -{ - using type = fbase; -}; - -template <> -struct flt_type_impl<float> -{ - using type = float; -}; -template <> -struct flt_type_impl<double> -{ - using type = double; -}; -} // namespace internal - -template <typename T> -using flt_type = typename internal::flt_type_impl<T>::type; - -namespace internal -{ -#ifdef CMT_COMPILER_CLANG -#define builtin_addressof(x) __builtin_addressof(x) -#else -template <class T> -inline T* builtin_addressof(T& arg) -{ - return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg))); -} -#endif - -#ifdef CMT_COMPILER_GNU -CMT_INLINE f32 builtin_sqrt(f32 x) { return __builtin_sqrtf(x); } -CMT_INLINE f64 builtin_sqrt(f64 x) { return 
__builtin_sqrt(x); } -CMT_INLINE f80 builtin_sqrt(f80 x) { return __builtin_sqrtl(x); } -CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) -{ - __builtin_memcpy(dest, src, size); -} -CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); } -#else - -CMT_INLINE f32 builtin_sqrt(f32 x) { return ::sqrtf(x); } -CMT_INLINE f64 builtin_sqrt(f64 x) { return ::sqrt(x); } -CMT_INLINE f80 builtin_sqrt(f80 x) { return ::sqrtl(x); } -CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); } -CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); } - -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes") - -template <typename T, bool A> -struct struct_with_alignment -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -}; - -template <typename T> -struct struct_with_alignment<T, false> -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -} -#ifdef CMT_GNU_ATTRIBUTES -__attribute__((__packed__, __may_alias__)) // -#endif -; - -CMT_PRAGMA_GNU(GCC diagnostic pop) -} // namespace internal - -/// @brief Fills a value with zeros -template <typename T1> -CMT_INLINE void zeroize(T1& value) -{ - internal::builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); -} - -/// @brief Used to determine the initial value for reduce functions -template <typename T> -struct initialvalue -{ -}; - -namespace internal -{ -template <size_t width, typename Fn> -CMT_INLINE void block_process_impl(size_t& i, size_t size, Fn&& fn) -{ - CMT_LOOP_NOUNROLL - for (; i < size / width * width; i += width) - fn(i, csize_t<width>()); -} -} // namespace internal - -template <size_t... 
widths, typename Fn> -CMT_INLINE void block_process(size_t size, csizes_t<widths...>, Fn&& fn) -{ - size_t i = 0; - swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... }; -} - -template <typename T> -struct is_simd_type - : std::integral_constant< - bool, std::is_same<T, float>::value || std::is_same<T, double>::value || - std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value || - std::is_same<T, short>::value || std::is_same<T, unsigned short>::value || - std::is_same<T, int>::value || std::is_same<T, unsigned int>::value || - std::is_same<T, long>::value || std::is_same<T, unsigned long>::value || - std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value> -{ -}; - -template <typename T, size_t N> -struct vec_t -{ - static_assert(N > 0 && N <= 1024, "Invalid vector size"); - - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - using value_type = T; - constexpr static size_t size() noexcept { return N; } - constexpr vec_t() noexcept = default; - - using scalar_type = subtype<T>; - constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } -}; - -constexpr size_t index_undefined = static_cast<size_t>(-1); - -struct czeros_t -{ -}; -struct cones_t -{ -}; -constexpr czeros_t czeros{}; -constexpr cones_t cones{}; - -using caligned_t = cbool_t<true>; -using cunaligned_t = cbool_t<false>; - -constexpr caligned_t caligned{}; -constexpr cunaligned_t cunaligned{}; - -#ifdef CMT_INTRINSICS_IS_CONSTEXPR -#define KFR_I_CE constexpr -#else -#define KFR_I_CE -#endif -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp @@ -27,10 +27,10 @@ #include "../cometa/array.hpp" -#include "function.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/read_write.hpp" +#include "../simd/types.hpp" #include "memory.hpp" -#include 
"read_write.hpp" -#include "types.hpp" CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4324)) @@ -97,20 +97,14 @@ struct univector_base : input_expression, output_expression using output_expression::end_block; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value) { T* data = derived_cast<Class>(this)->data(); write(ptr_cast<T>(data) + index, vec<T, N>(value)); } - template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const - { - const T* data = derived_cast<Class>(this)->data(); - return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index)); - } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE Class& operator=(Input&& input) + KFR_MEM_INTRINSIC Class& operator=(Input&& input) { assign_expr(std::forward<Input>(input)); return *derived_cast<Class>(this); @@ -254,15 +248,15 @@ struct univector_base : input_expression, output_expression protected: template <typename Input> - CMT_INLINE void assign_expr(Input&& input) + KFR_MEM_INTRINSIC void assign_expr(Input&& input) { process(*derived_cast<Class>(this), std::forward<Input>(input)); } private: - CMT_INLINE size_t get_size() const { return derived_cast<Class>(this)->size(); } - CMT_INLINE const T* get_data() const { return derived_cast<Class>(this)->data(); } - CMT_INLINE T* get_data() { return derived_cast<Class>(this)->data(); } + KFR_MEM_INTRINSIC size_t get_size() const { return derived_cast<Class>(this)->size(); } + KFR_MEM_INTRINSIC const T* get_data() const { return derived_cast<Class>(this)->data(); } + KFR_MEM_INTRINSIC T* get_data() { return derived_cast<Class>(this)->data(); } static void copy(T* dest, const T* src, size_t size) { @@ -283,12 +277,12 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S 
this->assign_expr(std::forward<Input>(input)); } template <typename... Args> - constexpr univector(const T& x, const Args&... args) noexcept + constexpr univector(const T& x, const Args&... args) CMT_NOEXCEPT : std::array<T, Size>{ { x, static_cast<T>(args)... } } { } - constexpr univector() noexcept(noexcept(std::array<T, Size>())) = default; + constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::array<T, Size>())) = default; constexpr univector(size_t, const T& value) { std::fill(this->begin(), this->end(), value); } constexpr static bool size_known = true; constexpr static bool is_array = true; @@ -298,13 +292,13 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S constexpr static bool is_pod = kfr::is_pod<T>::value; using value_type = T; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? this->operator[](index) : fallback_value; } using univector_base<T, univector>::operator=; - void resize(size_t) noexcept {} + void resize(size_t) CMT_NOEXCEPT {} }; template <typename T> @@ -334,7 +328,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T constexpr univector(univector<U, Tag>& other) : array_ref<T>(other.data(), other.size()) { } - void resize(size_t) noexcept {} + void resize(size_t) CMT_NOEXCEPT {} constexpr static bool size_known = false; constexpr static bool is_array = false; constexpr static bool is_array_ref = true; @@ -342,7 +336,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T constexpr static bool is_aligned = false; using value_type = remove_const<T>; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? 
this->operator[](index) : fallback_value; } @@ -364,9 +358,11 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>, this->resize(input.size()); this->assign_expr(std::forward<Input>(input)); } - constexpr univector() noexcept(noexcept(std::vector<T, allocator<T>>())) = default; + constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::vector<T, allocator<T>>())) = default; constexpr univector(const std::vector<T, allocator<T>>& other) : std::vector<T, allocator<T>>(other) {} - constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other)) {} + constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other)) + { + } constexpr univector(const array_ref<T>& other) : std::vector<T, allocator<T>>(other.begin(), other.end()) { } @@ -378,19 +374,19 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>, constexpr univector(const std::vector<T, Allocator>&) = delete; template <typename Allocator> constexpr univector(std::vector<T, Allocator>&&) = delete; - constexpr static bool size_known = false; - constexpr static bool is_array = false; - constexpr static bool is_array_ref = false; - constexpr static bool is_vector = true; - constexpr static bool is_aligned = true; - using value_type = T; + constexpr static bool size_known = false; + constexpr static bool is_array = false; + constexpr static bool is_array_ref = false; + constexpr static bool is_vector = true; + constexpr static bool is_aligned = true; + using value_type = T; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? 
this->operator[](index) : fallback_value; } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE univector& operator=(Input&& input) + KFR_MEM_INTRINSIC univector& operator=(Input&& input) { if (input.size() != infinite_size) this->resize(input.size()); @@ -416,40 +412,18 @@ using univector3d = abstract_vector<abstract_vector<univector<T, Size3>, Size2>, /// @brief Creates univector from data and size template <typename T> -CMT_INLINE univector_ref<T> make_univector(T* data, size_t size) +KFR_INTRINSIC univector_ref<T> make_univector(T* data, size_t size) { return univector_ref<T>(data, size); } /// @brief Creates univector from data and size template <typename T> -CMT_INLINE univector_ref<const T> make_univector(const T* data, size_t size) +KFR_INTRINSIC univector_ref<const T> make_univector(const T* data, size_t size) { return univector_ref<const T>(data, size); } -/// @brief Converts an expression to univector -template <typename Expr, typename T = value_type_of<Expr>> -CMT_INLINE univector<T> render(Expr&& expr) -{ - static_assert(!is_infinite<Expr>::value, - "render: Can't process infinite expressions. 
Pass size as a second argument to render."); - univector<T> result; - result.resize(expr.size()); - result = expr; - return result; -} - -/// @brief Converts an expression to univector -template <typename Expr, typename T = value_type_of<Expr>> -CMT_INLINE univector<T> render(Expr&& expr, size_t size, size_t offset = 0) -{ - univector<T> result; - result.resize(size); - result = slice(expr, offset, size); - return result; -} - /// @brief Single producer single consumer lock-free ring buffer template <typename T> struct lockfree_ring_buffer @@ -476,8 +450,8 @@ struct lockfree_ring_buffer const size_t real_tail = cur_tail % buffer.size(); const size_t first_size = std::min(buffer.size() - real_tail, size); - internal::builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T)); - internal::builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T)); + builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T)); + builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T)); std::atomic_thread_fence(std::memory_order_release); @@ -500,8 +474,8 @@ struct lockfree_ring_buffer const size_t real_front = cur_front % buffer.size(); const size_t first_size = std::min(buffer.size() - real_front, size); - internal::builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T)); - internal::builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T)); + builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T)); + builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T)); std::atomic_thread_fence(std::memory_order_release); @@ -514,6 +488,47 @@ private: char cacheline_filler[64 - sizeof(std::atomic<size_t>)]; std::atomic<size_t> tail; }; +inline namespace CMT_ARCH_NAME +{ + +template <typename T, univector_tag Tag, typename U, size_t N> +KFR_INTRINSIC vec<U, N> get_elements(const univector<T, Tag>& self, cinput_t, size_t index, 
vec_shape<U, N>) +{ + const T* data = self.data(); + return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index)); +} + +/// @brief Converts an expression to univector +template <typename Expr, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T> render(Expr&& expr) +{ + static_assert(!is_infinite<Expr>::value, + "render: Can't process infinite expressions. Pass size as a second argument to render."); + univector<T> result; + result.resize(expr.size()); + result = expr; + return result; +} + +/// @brief Converts an expression to univector +template <typename Expr, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T> render(Expr&& expr, size_t size, size_t offset = 0) +{ + univector<T> result; + result.resize(size); + result = slice(expr, offset, size); + return result; +} + +/// @brief Converts an expression to univector +template <typename Expr, size_t Size, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T, Size> render(Expr&& expr, csize_t<Size>) +{ + univector<T, Size> result; + result = expr; + return result; +} +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp @@ -1,1171 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" - -#include "constants.hpp" -#include "platform.hpp" -#include "types.hpp" - -namespace kfr -{ - -template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -CMT_INLINE vec<T, Nout> low(const vec<T, N>& x); -template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -CMT_INLINE vec<T, Nout> high(const vec<T, N>& x); -} // namespace kfr - -#ifdef CMT_COMPILER_CLANG -#include "simd_clang.hpp" -#else -#include "simd_intrin.hpp" -#ifdef CMT_ARCH_X86 -#include "simd_x86.hpp" -#endif -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked") - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -namespace kfr -{ - -template <typename T> -using maskfor = typename T::mask_t; - -template <typename T, size_t N> -struct mask : protected vec<T, N> -{ - using base = vec<T, N>; - KFR_I_CE mask() noexcept = default; - KFR_I_CE mask(const mask&) noexcept = default; - KFR_I_CE mask& operator=(const mask&) noexcept = default; - using simd_type = typename base::simd_type; - - simd_type operator*() const noexcept { return this->simd; } - simd_type& operator*() noexcept { return this->simd; } - - KFR_I_CE mask(const base& v) noexcept - //: base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec())) - { - this->simd = *base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()); - } - - KFR_I_CE mask(const 
simd_type& simd) : base(simd) {} - template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) - { - } - template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m)) - { - } - KFR_I_CE mask operator&(const mask& y) const noexcept - { - return static_cast<const base&>(*this) & static_cast<const base&>(y); - } - KFR_I_CE mask operator|(const mask& y) const noexcept - { - return static_cast<const base&>(*this) | static_cast<const base&>(y); - } - KFR_I_CE mask operator&&(const mask& y) const noexcept - { - return static_cast<const base&>(*this) & static_cast<const base&>(y); - } - KFR_I_CE mask operator||(const mask& y) const noexcept - { - return static_cast<const base&>(*this) | static_cast<const base&>(y); - } - KFR_I_CE mask operator^(const mask& y) const noexcept - { - return static_cast<const base&>(*this) ^ static_cast<const base&>(y); - } - KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); } - - bool operator[](size_t index) const noexcept; - - constexpr base asvec() const noexcept { return reinterpret_cast<const base&>(*this); } -}; - -namespace internal -{ - -constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index) -{ - return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index); -} - -template <size_t counter, size_t groupsize, size_t... indices> -constexpr inline size_t scale_get_index(csizes_t<indices...>) -{ - return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); -} - -template <size_t... indices, size_t... 
counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> -constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) noexcept - -> csizes_t<scale_get_index<counter, groupsize>(ind)...> -{ - return {}; -} -} // namespace internal - -template <size_t groupsize, size_t... indices> -constexpr inline auto scale() noexcept -{ - return internal::scale_impl(csizes_t<indices...>(), csizeseq_t<sizeof...(indices) * groupsize>()); -} - -template <typename T, size_t Nin, size_t N> -struct vec<vec<T, Nin>, N> : private vec<T, Nin * N> -{ - using base = vec<T, Nin * N>; - - using value_type = vec<T, Nin>; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return Nin * N; } - - using simd_type = typename base::simd_type; - - constexpr vec() noexcept = default; - constexpr vec(const vec&) noexcept = default; - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - constexpr vec(const simd_type& simd) noexcept : base(simd) {} - constexpr vec(czeros_t) noexcept : base(czeros) {} - constexpr vec(cones_t) noexcept : base(cones) {} - - constexpr vec(const value_type& v) noexcept : base(v.shuffle(csizeseq_t<Nin * N>() % csize_t<Nin>())) {} - - template <int = 0> - explicit constexpr vec(const vec<T, Nin * N>& v) noexcept : base(v) - { - } - - // from list of vectors - template <typename... Us> - constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : base(s0, s1, rest...) 
- { - } - - template <typename U> - constexpr vec(const vec<vec<U, Nin>, N>& v) noexcept : base(static_cast<vec<T, Nin * N>>(v.flatten())) - { - } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - constexpr static vec frombits(const vec<U, M>& v) noexcept - { - return vec(base::frombits(v.flatten())); - } - - // math / bitwise / comparison operators - constexpr friend vec operator+(const vec& x) noexcept { return x; } - constexpr friend vec operator-(const vec& x) noexcept { return base::operator-(x); } - constexpr friend vec operator~(const vec& x) noexcept { return base::operator~(x); } - -#define KFR_B(x) static_cast<const base&>(x) - - constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); } - constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); } - constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) * KFR_B(y)); } - constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) / KFR_B(y)); } - - constexpr friend vec operator<<(const vec& x, int shift) noexcept { return vec(KFR_B(x) << shift); } - constexpr friend vec operator>>(const vec& x, int shift) noexcept { return vec(KFR_B(x) >> shift); } - constexpr friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); } - constexpr friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); } - constexpr friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); } - -#undef KFR_B - - constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = 
x / y; } - - constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; } - constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; } - constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - constexpr friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - constexpr friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - constexpr vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return *base::shuffle(scale<Nin, indices...>()); - } - template <size_t... 
indices> - constexpr vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return *base::shuffle(y, scale<Nin, indices...>()); - } - - // element access - struct element; - CMT_GNU_CONSTEXPR value_type operator[](size_t index) const noexcept { return get(index); } - CMT_GNU_CONSTEXPR element operator[](size_t index) noexcept { return { *this, index }; } - - CMT_GNU_CONSTEXPR value_type get(size_t index) const noexcept - { - return reinterpret_cast<const value_type(&)[N]>(*this)[index]; - } - CMT_GNU_CONSTEXPR void set(size_t index, const value_type& s) noexcept - { - reinterpret_cast<value_type(&)[N]>(*this)[index] = s; - } - template <size_t index> - CMT_GNU_CONSTEXPR value_type get(csize_t<index>) const noexcept - { - return static_cast<const base&>(*this).shuffle(csizeseq_t<Nin, index * Nin>()); - } - template <size_t index> - CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept - { - *this = vec(static_cast<const base&>(*this)) - .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() && - csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) * - N); - } - struct element - { - constexpr operator value_type() const noexcept { return v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - vec& v; - size_t index; - }; - - template <bool aligned = false> - explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : base(ptr_cast<T>(src), cbool_t<aligned>()) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - base::write(ptr_cast<T>(dest), cbool_t<aligned>()); - return *this; - } - - const base& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return base::operator*(); } - simd_type& operator*() noexcept { return base::operator*(); } -}; - -namespace internal -{ - -template 
<typename T> -constexpr inline T maskbits(bool value) -{ - return value ? constants<T>::allones() : T(); -} - -template <typename T, size_t N> -struct flt_type_impl<vec<T, N>> -{ - using type = vec<typename flt_type_impl<T>::type, N>; -}; - -template <typename T> -struct is_vec_impl : std::false_type -{ -}; - -template <typename T, size_t N> -struct is_vec_impl<vec<T, N>> : std::true_type -{ -}; -} // namespace internal - -template <typename T> -using is_vec = internal::is_vec_impl<T>; - -template <typename To, typename From, size_t N, - KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value), - size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width> -constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>(value.flatten()); -} - -#ifdef KFR_ENABLE_SWIZZLE -namespace swizzle -{ -template <size_t> -struct swiz -{ - constexpr swiz() {} -}; - -constexpr swiz<0> x{}; -constexpr swiz<1> y{}; -constexpr swiz<2> z{}; -constexpr swiz<3> w{}; -constexpr swiz<0> r{}; -constexpr swiz<1> g{}; -constexpr swiz<2> b{}; -constexpr swiz<3> a{}; -constexpr swiz<0> s{}; -constexpr swiz<1> t{}; -constexpr swiz<2> p{}; -constexpr swiz<3> q{}; - -constexpr swiz<0> s0{}; -constexpr swiz<1> s1{}; -constexpr swiz<2> s2{}; -constexpr swiz<3> s3{}; -constexpr swiz<4> s4{}; -constexpr swiz<5> s5{}; -constexpr swiz<6> s6{}; -constexpr swiz<7> s7{}; -constexpr swiz<8> s8{}; -constexpr swiz<9> s9{}; -constexpr swiz<10> s10{}; -constexpr swiz<11> s11{}; -constexpr swiz<12> s12{}; -constexpr swiz<13> s13{}; -constexpr swiz<14> s14{}; -constexpr swiz<15> s15{}; -} // namespace swizzle -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast") - -template <size_t N, typename T> -constexpr CMT_INLINE vec<T, N> broadcast(T x) -{ - return x; -} - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -namespace internal -{ - -template <typename To, typename From, size_t N, typename Tsub = 
deep_subtype<To>, - size_t Nout = N* compound_type_traits<To>::deep_width> -constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept -{ - return vec<To, N>(value); -} - -// scalar to scalar -template <typename To, typename From> -struct conversion -{ - static_assert(std::is_convertible<From, To>::value, ""); - static To cast(const From& value) { return value; } -}; - -// vector to vector -template <typename To, typename From, size_t N> -struct conversion<vec<To, N>, vec<From, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<To, N> cast(const vec<From, N>& value) { return builtin_convertvector<To>(value); } -}; - -// vector<vector> to vector<vector> -template <typename To, typename From, size_t N1, size_t N2> -struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, N1>, N2>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) - { - return builtin_convertvector<vec<To, N1>>(value); - } -}; - -// scalar to vector -template <typename To, typename From, size_t N> -struct conversion<vec<To, N>, From> -{ - static_assert(std::is_convertible<From, To>::value, ""); - static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } -}; -} // namespace internal - -template <typename T> -constexpr size_t size_of() noexcept -{ - return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width; -} - -template <typename From, size_t N, typename Tsub = deep_subtype<From>, - size_t Nout = N* size_of<From>() / size_of<Tsub>()> -constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& x) noexcept -{ - return x.flatten(); -} - -template <typename To, typename From, - typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> -constexpr CMT_INLINE Tout cast(const From& value) noexcept -{ - return 
static_cast<Tout>(value); -} - -template <typename To, typename From> -CMT_GNU_CONSTEXPR CMT_INLINE To bitcast(const From& value) noexcept -{ - static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); - union { - From from; - To to; - } u{ value }; - return u.to; -} - -template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> -CMT_GNU_CONSTEXPR CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To ubitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To ibitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To fbitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, size_t N, typename To = utype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, size_t N, typename To = itype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, size_t N, typename To = ftype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename T, size_t N> -inline bool mask<T, N>::operator[](size_t index) const noexcept -{ - return ibitcast(base::operator[](index)) < 0; -} - 
-constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); } - -namespace internal -{ -template <size_t start = 0, size_t stride = 1> -struct shuffle_index -{ - constexpr CMT_INLINE size_t operator()(size_t index) const { return start + index * stride; } -}; - -template <size_t count, size_t start = 0, size_t stride = 1> -struct shuffle_index_wrap -{ - constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; } -}; -} // namespace internal - -template <size_t count, typename T, size_t N, size_t Nout = N* count> -CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() % csize_t<N>()); -} -KFR_FN(repeat) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)> -CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() % csize_t<N>()); -} -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)> -constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) -{ - return x; -} -KFR_FN(resize) - -template <typename T, size_t N> -struct pkd_vec -{ - constexpr pkd_vec() noexcept {} - pkd_vec(const vec<T, N>& value) noexcept { value.write(v); } - template <typename... Ts> - constexpr pkd_vec(Ts... init) noexcept : v{ static_cast<T>(init)... } - { - static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec"); - } - -private: - T v[N]; - friend struct vec<T, N>; -} -#ifdef CMT_GNU_ATTRIBUTES -__attribute__((packed)) -#endif -; - -namespace internal -{ - -template <size_t, typename T> -constexpr CMT_INLINE T make_vector_get_n() -{ - return T(); -} -template <size_t index, typename T, typename... Args> -constexpr CMT_INLINE T make_vector_get_n(const T& arg, const Args&... args) -{ - return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...); -} - -template <typename T, typename... Args, size_t... 
indices, size_t N = sizeof...(Args)> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args) -{ - const T list[] = { static_cast<T>(args)... }; - return vec<T, N>(list[indices]...); -} -} // namespace internal - -/// Create vector from scalar values -/// @code -/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} ); -/// @endcode -template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), - typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>> -constexpr CMT_INLINE vec<SubType, N> make_vector(const Arg& x, const Args&... rest) -{ - return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x), - static_cast<SubType>(rest)...); -} -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> make_vector(const vec<T, N>& x) -{ - return x; -} -template <typename T, T... Values, size_t N = sizeof...(Values)> -constexpr CMT_INLINE vec<T, N> make_vector(cvals_t<T, Values...>) -{ - return make_vector<T>(Values...); -} -KFR_FN(make_vector) - -template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), - typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>, - KFR_ENABLE_IF(is_number<subtype<SubType>>::value)> -constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... 
rest) -{ - return internal::make_vector_impl<SubType>(csizeseq_t<N * widthof<SubType>()>(), static_cast<SubType>(x), - static_cast<SubType>(rest)...); -} -KFR_FN(pack) - -namespace operators -{ -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y); -} - -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) / 
static_cast<vec<C, N>>(y); -} - -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y); -} - -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator&&(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) && y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator||(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) || y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator&(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) & y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator|(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) | y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) ^ y; -} -} // namespace operators - -using namespace operators; - -template <typename T, size_t N1, size_t N2 = N1> -using mat = vec<vec<T, N1>, N2>; - -namespace internal -{ - -template <size_t start, size_t count> -struct 
shuffle_index_extend -{ - constexpr CMT_INLINE size_t operator()(size_t index) const - { - return index >= start && index < start + count ? index - start : index_undefined; - } -}; - -template <typename T, size_t Nout, size_t N1, size_t... indices> -constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>) -{ - return make_vector(maskbits<T>(indices < N1)...); -} -template <typename T, size_t Nout, size_t N1> -constexpr vec<T, Nout> partial_mask() -{ - return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); -} -} // namespace internal - -template <typename T> -using optvec = vec<T, platform<T>::vector_capacity / 4>; - -using f32x1 = vec<f32, 1>; -using f32x2 = vec<f32, 2>; -using f32x3 = vec<f32, 3>; -using f32x4 = vec<f32, 4>; -using f32x8 = vec<f32, 8>; -using f32x16 = vec<f32, 16>; -using f32x32 = vec<f32, 32>; -using f32x64 = vec<f32, 64>; -using f64x1 = vec<f64, 1>; -using f64x2 = vec<f64, 2>; -using f64x3 = vec<f64, 3>; -using f64x4 = vec<f64, 4>; -using f64x8 = vec<f64, 8>; -using f64x16 = vec<f64, 16>; -using f64x32 = vec<f64, 32>; -using f64x64 = vec<f64, 64>; -using i8x1 = vec<i8, 1>; -using i8x2 = vec<i8, 2>; -using i8x3 = vec<i8, 3>; -using i8x4 = vec<i8, 4>; -using i8x8 = vec<i8, 8>; -using i8x16 = vec<i8, 16>; -using i8x32 = vec<i8, 32>; -using i8x64 = vec<i8, 64>; -using i16x1 = vec<i16, 1>; -using i16x2 = vec<i16, 2>; -using i16x3 = vec<i16, 3>; -using i16x4 = vec<i16, 4>; -using i16x8 = vec<i16, 8>; -using i16x16 = vec<i16, 16>; -using i16x32 = vec<i16, 32>; -using i16x64 = vec<i16, 64>; -using i32x1 = vec<i32, 1>; -using i32x2 = vec<i32, 2>; -using i32x3 = vec<i32, 3>; -using i32x4 = vec<i32, 4>; -using i32x8 = vec<i32, 8>; -using i32x16 = vec<i32, 16>; -using i32x32 = vec<i32, 32>; -using i32x64 = vec<i32, 64>; -using i64x1 = vec<i64, 1>; -using i64x2 = vec<i64, 2>; -using i64x3 = vec<i64, 3>; -using i64x4 = vec<i64, 4>; -using i64x8 = vec<i64, 8>; -using i64x16 = vec<i64, 16>; -using i64x32 = vec<i64, 32>; -using i64x64 = 
vec<i64, 64>; -using u8x1 = vec<u8, 1>; -using u8x2 = vec<u8, 2>; -using u8x3 = vec<u8, 3>; -using u8x4 = vec<u8, 4>; -using u8x8 = vec<u8, 8>; -using u8x16 = vec<u8, 16>; -using u8x32 = vec<u8, 32>; -using u8x64 = vec<u8, 64>; -using u16x1 = vec<u16, 1>; -using u16x2 = vec<u16, 2>; -using u16x3 = vec<u16, 3>; -using u16x4 = vec<u16, 4>; -using u16x8 = vec<u16, 8>; -using u16x16 = vec<u16, 16>; -using u16x32 = vec<u16, 32>; -using u16x64 = vec<u16, 64>; -using u32x1 = vec<u32, 1>; -using u32x2 = vec<u32, 2>; -using u32x3 = vec<u32, 3>; -using u32x4 = vec<u32, 4>; -using u32x8 = vec<u32, 8>; -using u32x16 = vec<u32, 16>; -using u32x32 = vec<u32, 32>; -using u32x64 = vec<u32, 64>; -using u64x1 = vec<u64, 1>; -using u64x2 = vec<u64, 2>; -using u64x3 = vec<u64, 3>; -using u64x4 = vec<u64, 4>; -using u64x8 = vec<u64, 8>; -using u64x16 = vec<u64, 16>; -using u64x32 = vec<u64, 32>; -using u64x64 = vec<u64, 64>; - -using u8x2x2 = vec<vec<u8, 2>, 2>; -using i8x2x2 = vec<vec<i8, 2>, 2>; -using u16x2x2 = vec<vec<u16, 2>, 2>; -using i16x2x2 = vec<vec<i16, 2>, 2>; -using u32x2x2 = vec<vec<u32, 2>, 2>; -using i32x2x2 = vec<vec<i32, 2>, 2>; -using u64x2x2 = vec<vec<u64, 2>, 2>; -using i64x2x2 = vec<vec<i64, 2>, 2>; -using f32x2x2 = vec<vec<f32, 2>, 2>; -using f64x2x2 = vec<vec<f64, 2>, 2>; - -using u8x4x4 = vec<vec<u8, 4>, 4>; -using i8x4x4 = vec<vec<i8, 4>, 4>; -using u16x4x4 = vec<vec<u16, 4>, 4>; -using i16x4x4 = vec<vec<i16, 4>, 4>; -using u32x4x4 = vec<vec<u32, 4>, 4>; -using i32x4x4 = vec<vec<i32, 4>, 4>; -using u64x4x4 = vec<vec<u64, 4>, 4>; -using i64x4x4 = vec<vec<i64, 4>, 4>; -using f32x4x4 = vec<vec<f32, 4>, 4>; -using f64x4x4 = vec<vec<f64, 4>, 4>; - -namespace glsl_names -{ -using vec2 = f32x2; -using vec3 = f32x3; -using vec4 = f32x4; -using dvec2 = f64x2; -using dvec3 = f64x3; -using dvec4 = f64x4; -using ivec2 = i32x2; -using ivec3 = i32x3; -using ivec4 = i32x4; -using uvec2 = u32x2; -using uvec3 = u32x3; -using uvec4 = u32x4; -} // namespace glsl_names -namespace 
opencl_names -{ -using char2 = i8x2; -using char3 = i8x3; -using char4 = i8x4; -using char8 = i8x8; -using char16 = i8x16; -using uchar2 = u8x2; -using uchar3 = u8x3; -using uchar4 = u8x4; -using uchar8 = u8x8; -using uchar16 = u8x16; - -using short2 = i16x2; -using short3 = i16x3; -using short4 = i16x4; -using short8 = i16x8; -using short16 = i16x16; -using ushort2 = u16x2; -using ushort3 = u16x3; -using ushort4 = u16x4; -using ushort8 = u16x8; -using ushort16 = u16x16; - -using int2 = i32x2; -using int3 = i32x3; -using int4 = i32x4; -using int8 = i32x8; -using int16 = i32x16; -using uint2 = u32x2; -using uint3 = u32x3; -using uint4 = u32x4; -using uint8 = u32x8; -using uint16 = u32x16; - -using long2 = i64x2; -using long3 = i64x3; -using long4 = i64x4; -using long8 = i64x8; -using long16 = i64x16; -using ulong2 = u64x2; -using ulong3 = u64x3; -using ulong4 = u64x4; -using ulong8 = u64x8; -using ulong16 = u64x16; - -using float2 = f32x2; -using float3 = f32x3; -using float4 = f32x4; -using float8 = f32x8; -using float16 = f32x16; - -using double2 = f64x2; -using double3 = f64x3; -using double4 = f64x4; -using double8 = f64x8; -using double16 = f64x16; -} // namespace opencl_names - -namespace internal -{ - -template <typename T, size_t N> -struct vec_type -{ - using type = vec<T, N>; -}; - -template <typename T, size_t Nmax> -struct maxvec -{ - constexpr static size_t size = Nmax; - vec<T, size> vmax; - maxvec(T initial) : vmax(initial) {} - template <int N> - vec<T, N>& v() - { - static_assert(N <= size, "N <= size"); - return reinterpret_cast<vec<T, N>&>(*this); - } - template <int N> - const vec<T, N>& v() const - { - static_assert(N <= size, "N <= size"); - return reinterpret_cast<const vec<T, N>&>(*this); - } -}; - -template <size_t Index, typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(subtype<decay<Args>>...)>> -constexpr CMT_INLINE Tout applyfn_helper(Fn&& fn, Args&&... 
args) -{ - return fn(args[Index]...); -} - -template <typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices> -constexpr CMT_INLINE vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) -{ - return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...); -} -template <typename T, size_t N, typename Fn, size_t... Indices> -constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) -{ - return make_vector(((void)Indices, void(), fn())...); -} -} // namespace internal - -template <typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> -constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args) -{ - return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>(), arg, - std::forward<Args>(args)...); -} - -template <size_t N, typename Fn, typename T = result_of<Fn()>> -constexpr CMT_INLINE vec<T, N> apply(Fn&& fn) -{ - return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>()); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD -CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); } -CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); } -#endif - -template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)> -constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... 
args) -{ - return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...); -} -KFR_FN(make_mask) - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector() -{ - return vec<T, N>(czeros); -} - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector(vec_t<T, N>) -{ - return vec<T, N>(czeros); -} -KFR_FN(zerovector) - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector() -{ - return vec<T, N>(cones); -} -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>) -{ - return vec<T, N>(cones); -} -KFR_FN(allonesvector) - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> undefinedvector() -{ - return vec<T, N>{}; -} -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> undefinedvector(vec_t<T, N>) -{ - return undefinedvector<T, N>(); -} -KFR_FN(undefinedvector) - -template <typename T, size_t N, size_t Nout /*= prev_poweroftwo(N - 1)*/> -CMT_INLINE vec<T, Nout> low(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>()); -} - -template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -CMT_INLINE vec_t<T, Nout> low(vec_t<T, N>) -{ - return {}; -} - -template <typename T, size_t N, size_t Nout /*= N - prev_poweroftwo(N - 1)*/> -CMT_INLINE vec<T, Nout> high(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout, prev_poweroftwo(N - 1)>()); -} - -template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>) -{ - return {}; -} -KFR_FN(low) -KFR_FN(high) -} // namespace kfr - -namespace cometa -{ - -template <typename T, size_t N> -struct compound_type_traits<kfr::vec_t<T, N>> -{ - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; - 
constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - - template <typename U> - using rebind = kfr::vec_t<U, N>; - template <typename U> - using deep_rebind = kfr::vec_t<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; -}; - -template <typename T, size_t N> -struct compound_type_traits<kfr::vec<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::vec<U, N>; - template <typename U> - using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; - - CMT_INLINE static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) { return value[index]; } -}; - -template <typename T, size_t N> -struct compound_type_traits<kfr::mask<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::mask<U, N>; - template <typename U> - using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; - - CMT_INLINE static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) - { - return value[index]; - } -}; -} // namespace cometa - -namespace std -{ -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, kfr::vec<T2, N>> -{ - using type = kfr::vec<typename common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, T2> -{ - using type = kfr::vec<typename 
common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<T1, kfr::vec<T2, N>> -{ - using type = kfr::vec<typename common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N1, size_t N2> -struct common_type<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>> -{ - using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; -}; -template <typename T1, typename T2, size_t N1, size_t N2> -struct common_type<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>> -{ - using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; -}; - -template <typename T1, typename T2, size_t N> -struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> -{ - using type = kfr::mask<typename common_type<T1, T2>::type, N>; -}; -} // namespace std - -CMT_PRAGMA_GNU(GCC diagnostic pop) -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -16,8 +16,10 @@ extern char* gets(char* __s); #ifdef CMT_ARCH_X86 #if defined(_M_X64) || defined(__x86_64__) #define CMT_ARCH_X64 1 +#define CMT_ARCH_BITNESS_NAME "64-bit" #else #define CMT_ARCH_X32 1 +#define CMT_ARCH_BITNESS_NAME "32-bit" #endif #ifndef CMT_FORCE_GENERIC_CPU @@ -133,8 +135,10 @@ extern char* gets(char* __s); #if defined(__aarch64__) #define CMT_ARCH_X64 1 +#define CMT_ARCH_BITNESS_NAME "64-bit" #else #define CMT_ARCH_X32 1 +#define CMT_ARCH_BITNESS_NAME "32-bit" #endif #ifdef __ARM_NEON__ @@ -146,22 +150,22 @@ extern char* gets(char* __s); #else #define CMT_ARCH_NEON 1 #define CMT_ARCH_NAME neon -#define KFR_NO_NATIVE_F64 1 +#define CMT_NO_NATIVE_F64 1 #endif #endif #endif #ifndef CMT_ARCH_NAME -#define CMT_ARCH_NAME common +#define CMT_ARCH_NAME generic #endif -#ifndef KFR_NO_NATIVE_F64 -#define KFR_NATIVE_F64 1 +#ifndef CMT_NO_NATIVE_F64 +#define CMT_NATIVE_F64 1 #endif -#ifndef KFR_NO_NATIVE_I64 -#define KFR_NATIVE_I64 1 +#ifndef CMT_NO_NATIVE_I64 +#define CMT_NATIVE_I64 1 #endif #define CMT_STRINGIFY2(x) #x @@ 
-250,28 +254,29 @@ extern char* gets(char* __s); #define CMT_ALWAYS_INLINE #endif #define CMT_INLINE __inline__ CMT_ALWAYS_INLINE -#define CMT_INTRIN CMT_INLINE CMT_NODEBUG #define CMT_INLINE_MEMBER CMT_ALWAYS_INLINE #define CMT_INLINE_LAMBDA CMT_INLINE_MEMBER #define CMT_NOINLINE __attribute__((__noinline__)) #define CMT_FLATTEN __attribute__((__flatten__)) #define CMT_RESTRICT __restrict__ -#define CMT_FUNC __inline__ #elif defined(CMT_MSVC_ATTRIBUTES) +#define CMT_ALWAYS_INLINE __forceinline #define CMT_NODEBUG #define CMT_INLINE /*inline*/ __forceinline -#define CMT_INTRIN CMT_INLINE CMT_NODEBUG #define CMT_INLINE_MEMBER __forceinline #define CMT_INLINE_LAMBDA #define CMT_NOINLINE __declspec(noinline) #define CMT_FLATTEN #define CMT_RESTRICT __restrict -#define CMT_FUNC inline #endif +#define CMT_INTRINSIC CMT_INLINE CMT_NODEBUG +#define CMT_MEM_INTRINSIC CMT_INLINE CMT_NODEBUG +#define CMT_FUNCTION inline + #if defined _MSC_VER && _MSC_VER >= 1900 && \ (!defined(__clang__) || \ (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9)))) @@ -386,8 +391,10 @@ extern char* gets(char* __s); #if CMT_HAS_NOEXCEPT #define CMT_NOEXCEPT noexcept +#define CMT_NOEXCEPT_SPEC(...) noexcept(__VA_ARGS__) #else #define CMT_NOEXCEPT +#define CMT_NOEXCEPT_SPEC(...) #endif #if CMT_COMPILER_GNU && !defined(__EXCEPTIONS) @@ -491,16 +498,55 @@ extern char* gets(char* __s); #define CMT_OS_NAME "unknown" #endif -#if defined CMT_COMPILER_CLANG +#if defined CMT_COMPILER_INTEL #if defined _MSC_VER -#define CMT_COMPIER_NAME "clang-msvc" +#define CMT_COMPILER_NAME "intel-msvc" +#define CMT_COMPILER_FULL_NAME \ + "clang-msvc-" CMT_STRINGIFY(__ICL) "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." 
CMT_STRINGIFY( \ + __INTEL_COMPILER_BUILD_DATE) +#else +#define CMT_COMPILER_NAME "intel" +#ifdef __INTEL_CLANG_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-clang" +#ifdef __INTEL_LLVM_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-clang-llvm" +#endif #else -#define CMT_COMPIER_NAME "clang" +#ifdef __INTEL_LLVM_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-llvm" +#else +#define CMT_COMPILER_INTEL_SPEC "" +#endif +#endif +#define CMT_COMPILER_FULL_NAME \ + "intel-" CMT_STRINGIFY(__INTEL_COMPILER) CMT_COMPILER_INTEL_SPEC \ + "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." CMT_STRINGIFY(__INTEL_COMPILER_BUILD_DATE) +#endif +#elif defined CMT_COMPILER_CLANG +#if defined _MSC_VER +#define CMT_COMPILER_NAME "clang-msvc" +#define CMT_COMPILER_FULL_NAME \ + "clang-msvc-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ + __clang_patchlevel__) +#else +#define CMT_COMPILER_NAME "clang" +#define CMT_COMPILER_FULL_NAME \ + "clang-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ + __clang_patchlevel__) #endif #elif defined CMT_COMPILER_GCC -#define CMT_COMPIER_NAME "gcc" +#define CMT_COMPILER_NAME "gcc" +#define CMT_COMPILER_FULL_NAME \ + "gcc-" CMT_STRINGIFY(__GNUC__) "." CMT_STRINGIFY(__GNUC_MINOR__) "." CMT_STRINGIFY(__GNUC_PATCHLEVEL__) #elif defined CMT_COMPILER_MSVC -#define CMT_COMPIER_NAME "msvc" +#define CMT_COMPILER_NAME "msvc" +#define CMT_COMPILER_FULL_NAME "msvc-" CMT_STRINGIFY(_MSC_VER) "." CMT_STRINGIFY(_MSC_FULL_VER) #else -#define CMT_COMPIER_NAME "unknown" +#define CMT_COMPILER_NAME "unknown" +#define CMT_COMPILER_FULL_NAME "unknown" #endif + +#define CMT_CONCAT(a, b) a##b + +#define CMT_NARGS2(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, ...) _10 +#define CMT_NARGS(...) 
CMT_NARGS2(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -8,11 +8,15 @@ #include <cstdint> #include <cstdlib> #include <limits> +#include <random> #include <type_traits> #include <utility> CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4814)) @@ -26,13 +30,13 @@ using std::size_t; #if __cplusplus >= 201103L || CMT_MSC_VER >= 1900 || CMT_HAS_FEATURE(cxx_constexpr) template <typename T, size_t N> -constexpr inline static size_t arraysize(const T (&)[N]) noexcept +constexpr inline static size_t arraysize(const T (&)[N]) CMT_NOEXCEPT { return N; } template <typename T, size_t N> -constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) noexcept +constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) CMT_NOEXCEPT { return {}; } @@ -173,9 +177,6 @@ using is_template_arg = std::integral_constant<bool, std::is_integral<T>::value template <typename T> using decay = typename std::decay<T>::type; -template <typename... T> -using decay_common = decay<common_type<T...>>; - template <typename T1, typename T2 = void, typename... 
Ts> constexpr size_t typeindex() { @@ -253,7 +254,7 @@ namespace ops { struct empty { - constexpr empty() noexcept {} + constexpr empty() CMT_NOEXCEPT {} }; } // namespace ops @@ -261,9 +262,9 @@ template <typename T, T val> struct cval_t : ops::empty { constexpr static T value = val; - constexpr cval_t() noexcept {} - constexpr cval_t(const cval_t&) noexcept = default; - constexpr cval_t(cval_t&&) noexcept = default; + constexpr cval_t() CMT_NOEXCEPT {} + constexpr cval_t(const cval_t&) CMT_NOEXCEPT = default; + constexpr cval_t(cval_t&&) CMT_NOEXCEPT = default; typedef T value_type; typedef cval_t type; constexpr operator value_type() const { return value; } @@ -386,6 +387,8 @@ struct get_nth_type<index> template <typename T, T... values> struct cvals_t : ops::empty { + constexpr cvals_t() CMT_NOEXCEPT = default; + using type = cvals_t<T, values...>; constexpr static size_t size() { return sizeof...(values); } template <size_t index> @@ -413,12 +416,13 @@ struct cvals_t : ops::empty constexpr cvals_t<T, details::get_nth_e<indices, type>::value...> operator[]( cvals_t<size_t, indices...>) const { + // static_assert(sizeof(T)==0, "+++++++++++++++++++++++++++++"); return {}; } // MSVC requires static_cast<T> here: template <typename Fn> - constexpr auto map(Fn&& fn) -> cvals_t<T, static_cast<T>(Fn()(values))...> + constexpr auto map(Fn&&) const -> cvals_t<T, static_cast<T>(Fn()(values))...> { return {}; } @@ -487,6 +491,10 @@ constexpr inline T cprod(cvals_t<T, first, rest...>) template <typename T> struct ctype_t { +#ifdef CMT_COMPILER_INTEL + constexpr ctype_t() CMT_NOEXCEPT = default; + constexpr ctype_t(const ctype_t&) CMT_NOEXCEPT = default; +#endif using type = T; }; @@ -510,9 +518,15 @@ struct ctypes_t namespace details { -template <typename T1, typename T2> +template <typename T1, typename... Ts> struct concat_impl; +template <typename T> +struct concat_impl<T> +{ + using type = T; +}; + template <typename T, T... values1, T... 
values2> struct concat_impl<cvals_t<T, values1...>, cvals_t<T, values2...>> { @@ -523,12 +537,19 @@ struct concat_impl<ctypes_t<types1...>, ctypes_t<types2...>> { using type = ctypes_t<types1..., types2...>; }; + +template <typename T1, typename T2, typename T3, typename... Ts> +struct concat_impl<T1, T2, T3, Ts...> +{ + using type = typename concat_impl<typename concat_impl<T1, T2>::type, T3, Ts...>::type; +}; + } // namespace details -template <typename T1, typename T2> -using concat_lists = typename details::concat_impl<T1, T2>::type; +template <typename T1, typename... Ts> +using concat_lists = typename details::concat_impl<decay<T1>, decay<Ts>...>::type; -template <typename T1, typename T2> -constexpr inline concat_lists<T1, T2> cconcat(T1, T2) +template <typename T1, typename... Ts> +constexpr inline concat_lists<T1, Ts...> cconcat(T1, Ts...) { return {}; } @@ -584,7 +605,7 @@ template <typename Fn> using function_result = typename details::function_arguments_impl<decltype(&Fn::operator())>::result; template <typename T1, typename T2> -using cfilter_t = typename details::filter_impl<T1, T2>::type; +using cfilter_t = typename details::filter_impl<decay<T1>, decay<T2>>::type; template <typename T, T... vals, bool... 
flags, typename Ret = cfilter_t<cvals_t<T, vals...>, cvals_t<bool, flags...>>> @@ -659,15 +680,13 @@ CMT_BIN_OP(^) namespace details { -template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -struct cvalseq_impl; - -template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -using cgen_seq = typename cvalseq_impl<T, Nsize, Nstart, Nstep>::type; template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -struct cvalseq_impl : concat_impl<cgen_seq<T, Nsize / 2, Nstart, Nstep>, - cgen_seq<T, Nsize - Nsize / 2, Nstart + (Nsize / 2) * Nstep, Nstep>> +struct cvalseq_impl + : concat_impl<typename cvalseq_impl<T, Nsize / 2, Nstart, Nstep>::type, + typename cvalseq_impl<T, Nsize - Nsize / 2, + static_cast<T>(Nstart + static_cast<ptrdiff_t>(Nsize / 2) * Nstep), + Nstep>::type> { }; @@ -679,6 +698,10 @@ template <typename T, T Nstart, ptrdiff_t Nstep> struct cvalseq_impl<T, 1, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart)> { }; +template <typename T, T Nstart, ptrdiff_t Nstep> +struct cvalseq_impl<T, 2, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart), static_cast<T>(Nstart + Nstep)> +{ +}; } // namespace details template <typename T, size_t size, T start = T(), ptrdiff_t step = 1> @@ -691,9 +714,11 @@ template <typename... List> using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>; template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)> -constexpr inline auto scale(csizes_t<indices...> i) noexcept +constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT { - return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % csize_t<group>(); + return cconcat(csizeseq_t<group, group * indices>()...); + // return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % + // csize_t<group>(); } namespace details @@ -814,12 +839,14 @@ constexpr inline unsigned ilog2(T n, unsigned p = 0) return (n <= 1) ? 
p : ilog2(n / 2, p + 1); } +/// @brief Returns a nearest power of two that is greater or equal than n template <typename T> constexpr inline T next_poweroftwo(T n) { return n > 2 ? T(1) << (ilog2(n - 1) + 1) : n; } +/// @brief Returns a nearest power of two that is less or equal than n template <typename T> constexpr inline T prev_poweroftwo(T n) { @@ -1007,7 +1034,7 @@ template <> constexpr size_t elementsize<void>() { return 1; -}; +} } // namespace details /// @brief Utility typedef used to disable type deduction @@ -1018,7 +1045,7 @@ using identity = typename details::identity_impl<T>::type; struct swallow { template <typename... T> - CMT_INTRIN constexpr swallow(T&&...) noexcept + CMT_MEM_INTRINSIC constexpr swallow(T&&...) CMT_NOEXCEPT { } }; @@ -1029,52 +1056,52 @@ struct carray; template <typename T> struct carray<T, 1> { - CMT_INTRIN constexpr carray() noexcept = default; - CMT_INTRIN constexpr carray(T val) noexcept : val(val) {} + CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(T val) CMT_NOEXCEPT : val(val) {} template <typename Fn, size_t index = 0, CMT_ENABLE_IF(is_callable<Fn, csize_t<index>>::value)> - CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept + CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT : val(static_cast<T>(fn(csize_t<index>()))) { } - CMT_INTRIN constexpr carray(const carray&) noexcept = default; - CMT_INTRIN constexpr carray(carray&&) noexcept = default; - CMT_INTRIN static constexpr size_t size() noexcept { return 1; } + CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return 1; } template <size_t index> - CMT_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT { static_assert(index == 0, 
"carray: Array index is out of range"); return val; } template <size_t index> - CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT { static_assert(index == 0, "carray: Array index is out of range"); return val; } template <size_t index> - CMT_INTRIN constexpr T& get() noexcept + CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT { return get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr const T& get() const noexcept + CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT { return get(csize_t<index>()); } - CMT_INTRIN constexpr const T* front() const noexcept { return val; } - CMT_INTRIN constexpr T* front() noexcept { return val; } - CMT_INTRIN constexpr const T* back() const noexcept { return val; } - CMT_INTRIN constexpr T* back() noexcept { return val; } - CMT_INTRIN constexpr const T* begin() const noexcept { return &val; } - CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; } - CMT_INTRIN constexpr T* begin() noexcept { return &val; } - CMT_INTRIN constexpr T* end() noexcept { return &val + 1; } - CMT_INTRIN constexpr const T* data() const noexcept { return begin(); } - CMT_INTRIN constexpr T* data() noexcept { return begin(); } - CMT_INTRIN constexpr bool empty() const noexcept { return false; } + CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return &val; } + CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return &val; } + CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC 
constexpr const T* data() const CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; } T val; }; @@ -1082,55 +1109,56 @@ template <typename T, size_t N> struct carray : carray<T, N - 1> { template <typename... Ts> - CMT_INTRIN constexpr carray(T first, Ts... list) noexcept : carray<T, N - 1>(list...), val(first) + CMT_MEM_INTRINSIC constexpr carray(T first, Ts... list) CMT_NOEXCEPT : carray<T, N - 1>(list...), + val(first) { static_assert(sizeof...(list) + 1 == N, "carray: Argument count is invalid"); } template <typename Fn, size_t index = N - 1> - CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept + CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT : carray<T, N - 1>(std::forward<Fn>(fn), csize_t<index - 1>()), val(static_cast<T>(fn(csize_t<index>()))) { } - CMT_INTRIN constexpr carray() noexcept = default; - CMT_INTRIN constexpr carray(const carray&) noexcept = default; - CMT_INTRIN constexpr carray(carray&&) noexcept = default; - CMT_INTRIN static constexpr size_t size() noexcept { return N; } - CMT_INTRIN constexpr T& get(csize_t<N - 1>) noexcept { return val; } + CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return N; } + CMT_MEM_INTRINSIC constexpr T& get(csize_t<N - 1>) CMT_NOEXCEPT { return val; } template <size_t index> - CMT_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT { return carray<T, N - 1>::get(csize_t<index>()); } - CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; } + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<N - 1>) 
const CMT_NOEXCEPT { return val; } template <size_t index> - CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT { return carray<T, N - 1>::get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr T& get() noexcept + CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT { return get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr const T& get() const noexcept + CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT { return get(csize_t<index>()); } - CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); } - CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); } - CMT_INTRIN constexpr const T* back() const noexcept { return val; } - CMT_INTRIN constexpr T* back() noexcept { return val; } - CMT_INTRIN constexpr const T* begin() const noexcept { return carray<T, N - 1>::begin(); } - CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; } - CMT_INTRIN constexpr T* begin() noexcept { return carray<T, N - 1>::begin(); } - CMT_INTRIN constexpr T* end() noexcept { return &val + 1; } - CMT_INTRIN constexpr const T* data() const noexcept { return begin(); } - CMT_INTRIN constexpr T* data() noexcept { return begin(); } - CMT_INTRIN constexpr bool empty() const noexcept { return false; } + CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return carray<T, N - 1>::front(); } + CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return carray<T, N - 1>::front(); } + CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return carray<T, N - 1>::begin(); } + CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return 
carray<T, N - 1>::begin(); } + CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr const T* data() const CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; } private: T val; @@ -1162,45 +1190,52 @@ private: /// @brief Function that returns its first argument template <typename T> -CMT_INTRIN constexpr T&& pass_through(T&& x) noexcept +CMT_INTRINSIC constexpr T&& pass_through(T&& x) CMT_NOEXCEPT { return std::forward<T>(x); } /// @brief Function that returns void and ignores all its arguments template <typename... Ts> -CMT_INTRIN constexpr void noop(Ts&&...) noexcept +CMT_INTRINSIC constexpr void noop(Ts&&...) CMT_NOEXCEPT { } /// @brief Function that returns its first argument and ignores all other arguments template <typename T1, typename... Ts> -CMT_INTRIN constexpr T1&& get_first(T1&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T1&& get_first(T1&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T1>(x); } /// @brief Function that returns its second argument and ignores all other arguments template <typename T1, typename T2, typename... Ts> -CMT_INTRIN constexpr T2&& get_second(T1, T2&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T2&& get_second(T1, T2&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T2>(x); } /// @brief Function that returns its third argument and ignores all other arguments template <typename T1, typename T2, typename T3, typename... Ts> -CMT_INTRIN constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T3>(x); } /// @brief Function that returns value-initialization of type T and ignores all its arguments template <typename T, typename... Ts> -CMT_INTRIN constexpr T returns(Ts&&...) +CMT_INTRINSIC constexpr T returns(Ts&&...) 
{ return T(); } +/// @brief Function that returns constant of type T and ignores all its arguments +template <typename T, T value, typename... Args> +CMT_INTRINSIC constexpr T return_constant(Args&&...) +{ + return value; +} + CMT_FN(pass_through) CMT_FN(noop) CMT_FN(get_first) @@ -1208,33 +1243,43 @@ CMT_FN(get_second) CMT_FN(get_third) CMT_FN_TPL((typename T), (T), returns) +template <typename T, T value> +struct fn_return_constant +{ + template <typename... Args> + constexpr T operator()(Args&&...) const noexcept + { + return value; + } +}; + template <typename T1, typename T2> -CMT_INTRIN bool is_equal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_equal(const T1& x, const T2& y) { return x == y; } template <typename T1, typename T2> -CMT_INTRIN bool is_notequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_notequal(const T1& x, const T2& y) { return x != y; } template <typename T1, typename T2> -CMT_INTRIN bool is_less(const T1& x, const T2& y) +CMT_INTRINSIC bool is_less(const T1& x, const T2& y) { return x < y; } template <typename T1, typename T2> -CMT_INTRIN bool is_greater(const T1& x, const T2& y) +CMT_INTRINSIC bool is_greater(const T1& x, const T2& y) { return x > y; } template <typename T1, typename T2> -CMT_INTRIN bool is_lessorequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_lessorequal(const T1& x, const T2& y) { return x <= y; } template <typename T1, typename T2> -CMT_INTRIN bool is_greaterorequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_greaterorequal(const T1& x, const T2& y) { return x >= y; } @@ -1313,7 +1358,7 @@ void cforeach_impl(Fn&& fn) #endif template <typename T, T... values, typename Fn> -CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn) +CMT_INTRINSIC void cforeach(cvals_t<T, values...>, Fn&& fn) { #ifdef CMT_COMPILER_CLANG swallow{ (fn(cval_t<T, values>()), void(), 0)... 
}; @@ -1323,7 +1368,7 @@ CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn) } template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)> -CMT_INTRIN void cforeach(T&& list, Fn&& fn) +CMT_INTRINSIC void cforeach(T&& list, Fn&& fn) { for (const auto& v : list) { @@ -1332,7 +1377,7 @@ CMT_INTRIN void cforeach(T&& list, Fn&& fn) } template <typename T, size_t N, typename Fn> -CMT_INTRIN void cforeach(const T (&array)[N], Fn&& fn) +CMT_INTRINSIC void cforeach(const T (&array)[N], Fn&& fn) { for (size_t i = 0; i < N; i++) { @@ -1344,59 +1389,94 @@ namespace details { template <size_t index, typename... types> -CMT_INTRIN auto get_type_arg(ctypes_t<types...>) +CMT_INTRINSIC auto get_type_arg(ctypes_t<types...>) { return ctype_t<type_of<details::get_nth_type<index, types...>>>(); } template <typename T0, typename... types, typename Fn, size_t... indices> -CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>) +CMT_INTRINSIC void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>) { swallow{ (fn(get_type_arg<indices>(type_list)), void(), 0)... }; } +template <typename Fn> +CMT_INTRINSIC void cforeach_types_impl(ctypes_t<>, Fn&&, csizes_t<>) +{ +} } // namespace details template <typename... 
Ts, typename Fn> -CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn) +CMT_INTRINSIC void cforeach(ctypes_t<Ts...> types, Fn&& fn) { details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq_t<sizeof...(Ts)>()); } template <typename A0, typename A1, typename Fn> -CMT_INTRIN void cforeach(A0&& a0, A1&& a1, Fn&& fn) +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, Fn&& fn) { - cforeach(std::forward<A0>(a0), - [&](auto v0) { cforeach(std::forward<A1>(a1), [&](auto v1) { fn(v0, v1); }); }); + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &fn](auto v1) { fn(v0, v1); }); + }); } template <typename A0, typename A1, typename A2, typename Fn> -CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) -{ - cforeach(std::forward<A0>(a0), [&](auto v0) { - cforeach(std::forward<A1>(a1), - [&](auto v1) { cforeach(std::forward<A2>(a2), [&](auto v2) { fn(v0, v1, v2); }); }); - }); +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) +{ + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &a2, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &a2, &fn](auto v1) { // + cforeach(std::forward<A2>(a2), // + [&v0, &v1, &fn](auto v2) { // + fn(v0, v1, v2); + }); + }); + }); +} + +template <typename A0, typename A1, typename A2, typename A3, typename Fn> +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, A3&& a3, Fn&& fn) +{ + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &a2, &a3, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &a2, &a3, &fn](auto v1) { // + cforeach(std::forward<A2>(a2), // + [&v0, &v1, &a3, &fn](auto v2) { // + cforeach(std::forward<A3>(a3), // + [&v0, &v1, &v2, &fn](auto v3) // + { fn(v0, v1, v2, v3); }); + }); + }); + }); } + template <typename TrueFn, typename FalseFn = fn_noop> -CMT_INTRIN decltype(auto) cif(cbool_t<true>, 
TrueFn&& truefn, FalseFn&& = FalseFn()) +CMT_INTRINSIC decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn()) { return truefn(ctrue); } template <typename TrueFn, typename FalseFn = fn_noop> -CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) +CMT_INTRINSIC decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) { return falsefn(cfalse); } template <typename T, T start, T stop, typename BodyFn> -CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) +CMT_INTRINSIC decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) { return cforeach(cvalseq_t<T, stop - start, start>(), std::forward<BodyFn>(bodyfn)); } template <typename T, T... vs, typename U, typename Function, typename Fallback = fn_noop> -void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& fallback = Fallback()) +CMT_INTRINSIC void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, + Fallback&& fallback = Fallback()) { bool result = false; swallow{ (result = result || ((vs == value) ? (function(cval_t<T, vs>()), void(), true) : false), void(), @@ -1406,14 +1486,15 @@ void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& } template <typename T, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CMT_INTRIN decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), CmpFn&& = CmpFn()) +CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), + CmpFn&& = CmpFn()) { return deffn(); } template <typename T, T v0, T... 
values, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, - DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn()) +CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, + DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn()) { if (cmpfn(value, v0)) { @@ -1428,7 +1509,6 @@ CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, namespace details { - template <typename T, typename Fn1, typename Fn2, typename... Fns> inline decltype(auto) cmatch_impl(T&& value, Fn1&& first, Fn2&& second, Fns&&... rest); template <typename T, typename Fn, typename... Ts> @@ -1491,15 +1571,15 @@ template <typename Fn> struct fn_noinline { template <typename... Args> - CMT_INTRIN result_of<Fn(Args...)> operator()(Args&&... args) const + CMT_MEM_INTRINSIC result_of<Fn(Args...)> operator()(Args&&... args) const { return noinline(Fn{}, std::forward<Args>(args)...); } -}; +}; // namespace cometa template <typename... Args, typename Fn, typename Ret = decltype(std::declval<Fn>()(std::declval<Args>()...)), typename NonMemFn = Ret (*)(Fn*, Args...)> -CMT_INTRIN NonMemFn make_nonmember(const Fn&) +CMT_INTRINSIC NonMemFn make_nonmember(const Fn&) { return [](Fn* fn, Args... args) -> Ret { return fn->operator()(std::forward<Args>(args)...); }; } @@ -1510,6 +1590,11 @@ constexpr inline T choose_const() static_assert(sizeof(T) != 0, "T not found in the list of template arguments"); return T(); } +template <typename T, typename C1> +constexpr inline T choose_const_fallback(C1 c1) +{ + return static_cast<T>(c1); +} /** * Selects constant of the specific type @@ -1518,10 +1603,21 @@ constexpr inline T choose_const() * CHECK( choose_const<f64>( 32.0f, 64.0 ) == 64.0 ); * @endcode */ +template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(std::is_same<T, C1>::value)> +constexpr inline T choose_const(C1 c1, Cs...) 
+{ + return static_cast<T>(c1); +} +template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(!std::is_same<T, C1>::value)> +constexpr inline T choose_const(C1, Cs... constants) +{ + return choose_const<T>(constants...); +} + template <typename T, typename C1, typename... Cs> -constexpr inline T choose_const(C1 c1, Cs... constants) +constexpr inline T choose_const_fallback(C1 c1, Cs... constants) { - return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const<T>(constants...); + return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const_fallback<T>(constants...); } template <typename Tfrom> @@ -1529,14 +1625,14 @@ struct autocast_impl { const Tfrom value; template <typename T> - CMT_INTRIN constexpr operator T() const noexcept + CMT_MEM_INTRINSIC constexpr operator T() const CMT_NOEXCEPT { return static_cast<T>(value); } }; template <typename Tfrom> -CMT_INTRIN constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) noexcept +CMT_INTRINSIC constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) CMT_NOEXCEPT { return { value }; } @@ -1603,49 +1699,49 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wundefined-reinterpret-cast") #endif template <typename T, typename U> -CMT_INLINE constexpr static T& ref_cast(U& ptr) +CMT_INTRINSIC constexpr static T& ref_cast(U& ptr) { return reinterpret_cast<T&>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T& ref_cast(const U& ptr) +CMT_INTRINSIC constexpr static const T& ref_cast(const U& ptr) { return reinterpret_cast<const T&>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T* ptr_cast(U* ptr) +CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr) { return reinterpret_cast<T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T* ptr_cast(const U* ptr) +CMT_INTRINSIC constexpr static const T* ptr_cast(const U* ptr) { return reinterpret_cast<const T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T* 
ptr_cast(U* ptr, ptrdiff_t offset) +CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr, ptrdiff_t offset) { return ptr_cast<T>(ptr_cast<unsigned char>(ptr) + offset); } template <typename T, typename U> -CMT_INLINE constexpr static T* derived_cast(U* ptr) +CMT_INTRINSIC constexpr static T* derived_cast(U* ptr) { return static_cast<T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T* derived_cast(const U* ptr) +CMT_INTRINSIC constexpr static const T* derived_cast(const U* ptr) { return static_cast<const T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T implicit_cast(U&& value) +CMT_INTRINSIC constexpr static T implicit_cast(U&& value) { return std::forward<T>(value); } @@ -1751,6 +1847,228 @@ constexpr conditional<std::is_scalar<T>::value, T, const T&> const_min(const T& return x < y ? x : y; } +template <int n = 10> +struct overload_priority : overload_priority<n - 1> +{ +}; + +template <> +struct overload_priority<0> +{ +}; + +constexpr overload_priority<> overload_auto{}; + +using overload_generic = overload_priority<0>; + +#define CMT_GEN_LIST1(m, ...) m(0, __VA_ARGS__) +#define CMT_GEN_LIST2(m, ...) CMT_GEN_LIST1(m, __VA_ARGS__), m(1, __VA_ARGS__) +#define CMT_GEN_LIST3(m, ...) CMT_GEN_LIST2(m, __VA_ARGS__), m(2, __VA_ARGS__) +#define CMT_GEN_LIST4(m, ...) CMT_GEN_LIST3(m, __VA_ARGS__), m(3, __VA_ARGS__) +#define CMT_GEN_LIST5(m, ...) CMT_GEN_LIST4(m, __VA_ARGS__), m(4, __VA_ARGS__) +#define CMT_GEN_LIST6(m, ...) CMT_GEN_LIST5(m, __VA_ARGS__), m(5, __VA_ARGS__) +#define CMT_GEN_LIST7(m, ...) CMT_GEN_LIST6(m, __VA_ARGS__), m(6, __VA_ARGS__) +#define CMT_GEN_LIST8(m, ...) CMT_GEN_LIST7(m, __VA_ARGS__), m(7, __VA_ARGS__) +#define CMT_GEN_LIST9(m, ...) CMT_GEN_LIST8(m, __VA_ARGS__), m(8, __VA_ARGS__) +#define CMT_GEN_LIST10(m, ...) CMT_GEN_LIST9(m, __VA_ARGS__), m(9, __VA_ARGS__) + +#define CMT_GEN_LIST11(m, ...) CMT_GEN_LIST10(m, __VA_ARGS__), m(10, __VA_ARGS__) +#define CMT_GEN_LIST12(m, ...) 
CMT_GEN_LIST11(m, __VA_ARGS__), m(11, __VA_ARGS__) +#define CMT_GEN_LIST13(m, ...) CMT_GEN_LIST12(m, __VA_ARGS__), m(12, __VA_ARGS__) +#define CMT_GEN_LIST14(m, ...) CMT_GEN_LIST13(m, __VA_ARGS__), m(13, __VA_ARGS__) +#define CMT_GEN_LIST15(m, ...) CMT_GEN_LIST14(m, __VA_ARGS__), m(14, __VA_ARGS__) +#define CMT_GEN_LIST16(m, ...) CMT_GEN_LIST15(m, __VA_ARGS__), m(15, __VA_ARGS__) +#define CMT_GEN_LIST17(m, ...) CMT_GEN_LIST16(m, __VA_ARGS__), m(16, __VA_ARGS__) +#define CMT_GEN_LIST18(m, ...) CMT_GEN_LIST17(m, __VA_ARGS__), m(17, __VA_ARGS__) +#define CMT_GEN_LIST19(m, ...) CMT_GEN_LIST18(m, __VA_ARGS__), m(18, __VA_ARGS__) +#define CMT_GEN_LIST20(m, ...) CMT_GEN_LIST19(m, __VA_ARGS__), m(19, __VA_ARGS__) + +#define CMT_GEN_LIST21(m, ...) CMT_GEN_LIST20(m, __VA_ARGS__), m(20, __VA_ARGS__) +#define CMT_GEN_LIST22(m, ...) CMT_GEN_LIST21(m, __VA_ARGS__), m(21, __VA_ARGS__) +#define CMT_GEN_LIST23(m, ...) CMT_GEN_LIST22(m, __VA_ARGS__), m(22, __VA_ARGS__) +#define CMT_GEN_LIST24(m, ...) CMT_GEN_LIST23(m, __VA_ARGS__), m(23, __VA_ARGS__) +#define CMT_GEN_LIST25(m, ...) CMT_GEN_LIST24(m, __VA_ARGS__), m(24, __VA_ARGS__) +#define CMT_GEN_LIST26(m, ...) CMT_GEN_LIST25(m, __VA_ARGS__), m(25, __VA_ARGS__) +#define CMT_GEN_LIST27(m, ...) CMT_GEN_LIST26(m, __VA_ARGS__), m(26, __VA_ARGS__) +#define CMT_GEN_LIST28(m, ...) CMT_GEN_LIST27(m, __VA_ARGS__), m(27, __VA_ARGS__) +#define CMT_GEN_LIST29(m, ...) CMT_GEN_LIST28(m, __VA_ARGS__), m(28, __VA_ARGS__) +#define CMT_GEN_LIST30(m, ...) CMT_GEN_LIST29(m, __VA_ARGS__), m(29, __VA_ARGS__) + +#define CMT_GEN_LIST31(m, ...) CMT_GEN_LIST30(m, __VA_ARGS__), m(30, __VA_ARGS__) +#define CMT_GEN_LIST32(m, ...) CMT_GEN_LIST31(m, __VA_ARGS__), m(31, __VA_ARGS__) +#define CMT_GEN_LIST33(m, ...) CMT_GEN_LIST32(m, __VA_ARGS__), m(32, __VA_ARGS__) +#define CMT_GEN_LIST34(m, ...) CMT_GEN_LIST33(m, __VA_ARGS__), m(33, __VA_ARGS__) +#define CMT_GEN_LIST35(m, ...) CMT_GEN_LIST34(m, __VA_ARGS__), m(34, __VA_ARGS__) +#define CMT_GEN_LIST36(m, ...) 
CMT_GEN_LIST35(m, __VA_ARGS__), m(35, __VA_ARGS__) +#define CMT_GEN_LIST37(m, ...) CMT_GEN_LIST36(m, __VA_ARGS__), m(36, __VA_ARGS__) +#define CMT_GEN_LIST38(m, ...) CMT_GEN_LIST37(m, __VA_ARGS__), m(37, __VA_ARGS__) +#define CMT_GEN_LIST39(m, ...) CMT_GEN_LIST38(m, __VA_ARGS__), m(38, __VA_ARGS__) +#define CMT_GEN_LIST40(m, ...) CMT_GEN_LIST39(m, __VA_ARGS__), m(39, __VA_ARGS__) + +#define CMT_GEN_LIST41(m, ...) CMT_GEN_LIST40(m, __VA_ARGS__), m(40, __VA_ARGS__) +#define CMT_GEN_LIST42(m, ...) CMT_GEN_LIST41(m, __VA_ARGS__), m(41, __VA_ARGS__) +#define CMT_GEN_LIST43(m, ...) CMT_GEN_LIST42(m, __VA_ARGS__), m(42, __VA_ARGS__) +#define CMT_GEN_LIST44(m, ...) CMT_GEN_LIST43(m, __VA_ARGS__), m(43, __VA_ARGS__) +#define CMT_GEN_LIST45(m, ...) CMT_GEN_LIST44(m, __VA_ARGS__), m(44, __VA_ARGS__) +#define CMT_GEN_LIST46(m, ...) CMT_GEN_LIST45(m, __VA_ARGS__), m(45, __VA_ARGS__) +#define CMT_GEN_LIST47(m, ...) CMT_GEN_LIST46(m, __VA_ARGS__), m(46, __VA_ARGS__) +#define CMT_GEN_LIST48(m, ...) CMT_GEN_LIST47(m, __VA_ARGS__), m(47, __VA_ARGS__) +#define CMT_GEN_LIST49(m, ...) CMT_GEN_LIST48(m, __VA_ARGS__), m(48, __VA_ARGS__) +#define CMT_GEN_LIST50(m, ...) CMT_GEN_LIST49(m, __VA_ARGS__), m(49, __VA_ARGS__) + +#define CMT_GEN_LIST51(m, ...) CMT_GEN_LIST50(m, __VA_ARGS__), m(50, __VA_ARGS__) +#define CMT_GEN_LIST52(m, ...) CMT_GEN_LIST51(m, __VA_ARGS__), m(51, __VA_ARGS__) +#define CMT_GEN_LIST53(m, ...) CMT_GEN_LIST52(m, __VA_ARGS__), m(52, __VA_ARGS__) +#define CMT_GEN_LIST54(m, ...) CMT_GEN_LIST53(m, __VA_ARGS__), m(53, __VA_ARGS__) +#define CMT_GEN_LIST55(m, ...) CMT_GEN_LIST54(m, __VA_ARGS__), m(54, __VA_ARGS__) +#define CMT_GEN_LIST56(m, ...) CMT_GEN_LIST55(m, __VA_ARGS__), m(55, __VA_ARGS__) +#define CMT_GEN_LIST57(m, ...) CMT_GEN_LIST56(m, __VA_ARGS__), m(56, __VA_ARGS__) +#define CMT_GEN_LIST58(m, ...) CMT_GEN_LIST57(m, __VA_ARGS__), m(57, __VA_ARGS__) +#define CMT_GEN_LIST59(m, ...) CMT_GEN_LIST58(m, __VA_ARGS__), m(58, __VA_ARGS__) +#define CMT_GEN_LIST60(m, ...) 
CMT_GEN_LIST59(m, __VA_ARGS__), m(59, __VA_ARGS__) + +#define CMT_GEN_LIST61(m, ...) CMT_GEN_LIST60(m, __VA_ARGS__), m(60, __VA_ARGS__) +#define CMT_GEN_LIST62(m, ...) CMT_GEN_LIST61(m, __VA_ARGS__), m(61, __VA_ARGS__) +#define CMT_GEN_LIST63(m, ...) CMT_GEN_LIST62(m, __VA_ARGS__), m(62, __VA_ARGS__) +#define CMT_GEN_LIST64(m, ...) CMT_GEN_LIST63(m, __VA_ARGS__), m(63, __VA_ARGS__) +#define CMT_GEN_LIST65(m, ...) CMT_GEN_LIST64(m, __VA_ARGS__), m(64, __VA_ARGS__) +#define CMT_GEN_LIST66(m, ...) CMT_GEN_LIST65(m, __VA_ARGS__), m(65, __VA_ARGS__) +#define CMT_GEN_LIST67(m, ...) CMT_GEN_LIST66(m, __VA_ARGS__), m(66, __VA_ARGS__) +#define CMT_GEN_LIST68(m, ...) CMT_GEN_LIST67(m, __VA_ARGS__), m(67, __VA_ARGS__) +#define CMT_GEN_LIST69(m, ...) CMT_GEN_LIST68(m, __VA_ARGS__), m(68, __VA_ARGS__) +#define CMT_GEN_LIST70(m, ...) CMT_GEN_LIST69(m, __VA_ARGS__), m(69, __VA_ARGS__) + +#define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__) + +template <typename Tout, typename Tin> +constexpr CMT_INLINE Tout bitcast_anything(const Tin& in) +{ + static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything"); +#ifdef CMT_COMPILER_INTEL + const union { + const Tin in; + Tout out; + } u{ in }; + return u.out; +#else + union { + Tin in; + Tout out; + } u{ in }; + return u.out; +#endif +} + +template <typename T> +constexpr T dont_deduce(T x) +{ + return x; +} + +template <typename Ty, typename T> +constexpr T just_value(T value) +{ + return value; +} + +enum class special_constant +{ + undefined, + default_constructed, + infinity, + neg_infinity, + min, + max, + neg_max, + lowest, + epsilon, + integer, + floating_point, + random_bits, +}; + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4700)) +CMT_PRAGMA_MSVC(warning(disable : 4146)) +struct special_value +{ + constexpr special_value(const special_value&) = default; + constexpr special_value(special_constant c) : c(c), ll(0), d(0) {} + constexpr special_value(double d) : 
c(special_constant::floating_point), ll(0), d(d) {} + constexpr special_value(long long ll) : c(special_constant::integer), ll(ll), d(0) {} + constexpr special_value(int i) : c(special_constant::integer), ll(i), d(0) {} + + template <typename T> + constexpr T get() const CMT_NOEXCEPT + { + switch (c) + { + CMT_PRAGMA_GNU(GCC diagnostic push) + CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized") + CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") + case special_constant::undefined: + T undef; + return undef; + CMT_PRAGMA_GNU(GCC diagnostic pop) + case special_constant::default_constructed: + return T{}; + case special_constant::infinity: + return std::numeric_limits<subtype<T>>::infinity(); + case special_constant::neg_infinity: + { + subtype<T> gg = std::numeric_limits<subtype<T>>::infinity(); + return -gg; + } + case special_constant::min: + return std::numeric_limits<subtype<T>>::min(); + case special_constant::max: + return std::numeric_limits<subtype<T>>::max(); + case special_constant::neg_max: + return static_cast<T>(-std::numeric_limits<subtype<T>>::max()); + case special_constant::lowest: + return std::numeric_limits<subtype<T>>::lowest(); + case special_constant::integer: + return static_cast<T>(ll); + case special_constant::floating_point: + return static_cast<T>(d); + case special_constant::random_bits: + return random_bits<T>(); + default: + return T{}; + } + } + + template <typename T> + constexpr operator T() const CMT_NOEXCEPT + { + return get<T>(); + } + special_constant c; + long long ll; + double d; + + static std::mt19937& random_generator() + { + static std::mt19937 rnd(1); + return rnd; + } + + template <typename T> + static T random_bits() + { + union { + uint32_t bits[(sizeof(T) + sizeof(uint32_t) - 1) / sizeof(uint32_t)]; + T value; + } u; + for (uint32_t& b : u.bits) + { + b = random_generator()(); + } + return u.value; + } +}; +CMT_PRAGMA_MSVC(warning(pop)) + CMT_PRAGMA_GNU(GCC diagnostic pop) } // namespace cometa diff 
--git a/include/kfr/cometa/array.hpp b/include/kfr/cometa/array.hpp @@ -28,31 +28,32 @@ public: using size_type = std::size_t; using difference_type = std::ptrdiff_t; - constexpr array_ref() noexcept : m_data(nullptr), m_size(0) {} - constexpr array_ref(const array_ref&) noexcept = default; - constexpr array_ref(array_ref&&) noexcept = default; + constexpr array_ref() CMT_NOEXCEPT : m_data(nullptr), m_size(0) {} + constexpr array_ref(const array_ref&) CMT_NOEXCEPT = default; + constexpr array_ref(array_ref&&) CMT_NOEXCEPT = default; #ifdef CMT_COMPILER_GNU - constexpr array_ref& operator=(const array_ref&) noexcept = default; - constexpr array_ref& operator=(array_ref&&) noexcept = default; + constexpr array_ref& operator=(const array_ref&) CMT_NOEXCEPT = default; + constexpr array_ref& operator=(array_ref&&) CMT_NOEXCEPT = default; #else array_ref& operator=(const array_ref&) = default; array_ref& operator=(array_ref&&) = default; #endif template <size_t N> - constexpr array_ref(value_type (&arr)[N]) noexcept : m_data(arr), m_size(N) + constexpr array_ref(value_type (&arr)[N]) CMT_NOEXCEPT : m_data(arr), m_size(N) { } template <size_t N> - constexpr array_ref(const std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N) + constexpr array_ref(const std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N) { } template <size_t N> - constexpr array_ref(std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N) + constexpr array_ref(std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N) { } template <typename Alloc> - constexpr array_ref(const std::vector<T, Alloc>& vec) noexcept : m_data(vec.data()), m_size(vec.size()) + constexpr array_ref(const std::vector<T, Alloc>& vec) CMT_NOEXCEPT : m_data(vec.data()), + m_size(vec.size()) { } @@ -61,26 +62,26 @@ public: { } - constexpr array_ref(const std::initializer_list<T>& vec) noexcept - : m_data(vec.begin()), m_size(vec.size()) + constexpr array_ref(const std::initializer_list<T>& vec) 
CMT_NOEXCEPT : m_data(vec.begin()), + m_size(vec.size()) { } template <typename InputIter> - constexpr array_ref(InputIter first, InputIter last) noexcept - : m_data(std::addressof(*first)), m_size(std::distance(first, last)) + constexpr array_ref(InputIter first, InputIter last) CMT_NOEXCEPT : m_data(std::addressof(*first)), + m_size(std::distance(first, last)) { } - constexpr array_ref(T* data, size_type size) noexcept : m_data(data), m_size(size) {} - - constexpr reference front() const noexcept { return m_data[0]; } - constexpr reference back() const noexcept { return m_data[m_size - 1]; } - constexpr iterator begin() const noexcept { return m_data; } - constexpr iterator end() const noexcept { return m_data + m_size; } - constexpr const_iterator cbegin() const noexcept { return m_data; } - constexpr const_iterator cend() const noexcept { return m_data + m_size; } - constexpr pointer data() const noexcept { return m_data; } - constexpr std::size_t size() const noexcept { return m_size; } - constexpr bool empty() const noexcept { return !m_size; } + constexpr array_ref(T* data, size_type size) CMT_NOEXCEPT : m_data(data), m_size(size) {} + + constexpr reference front() const CMT_NOEXCEPT { return m_data[0]; } + constexpr reference back() const CMT_NOEXCEPT { return m_data[m_size - 1]; } + constexpr iterator begin() const CMT_NOEXCEPT { return m_data; } + constexpr iterator end() const CMT_NOEXCEPT { return m_data + m_size; } + constexpr const_iterator cbegin() const CMT_NOEXCEPT { return m_data; } + constexpr const_iterator cend() const CMT_NOEXCEPT { return m_data + m_size; } + constexpr pointer data() const CMT_NOEXCEPT { return m_data; } + constexpr std::size_t size() const CMT_NOEXCEPT { return m_size; } + constexpr bool empty() const CMT_NOEXCEPT { return !m_size; } constexpr reference operator[](std::size_t index) const { return m_data[index]; } private: @@ -126,22 +127,22 @@ inline array_ref<const T> make_array_ref(const std::vector<T>& cont) } template 
<typename C> -constexpr auto datatype(C& c) +constexpr auto elementtype(C& c) { return c[0]; } template <typename C> -constexpr auto datatype(const C& c) +constexpr auto elementtype(const C& c) { return c[0]; } template <typename E> -constexpr E datatype(const std::initializer_list<E>& il) +constexpr E elementtype(const std::initializer_list<E>&) { return {}; } template <typename T, std::size_t N> -constexpr T datatype(T (&array)[N]) +constexpr T elementtype(T (&)[N]) { return {}; } @@ -157,17 +158,17 @@ constexpr auto data(const C& c) -> decltype(c.data()) return c.data(); } template <typename T, std::size_t N> -constexpr T* data(T (&array)[N]) noexcept +constexpr T* data(T (&array)[N]) CMT_NOEXCEPT { return array; } template <typename T> -constexpr T* data(T* array) noexcept +constexpr T* data(T* array) CMT_NOEXCEPT { return array; } template <typename E> -constexpr const E* data(const std::initializer_list<E>& il) noexcept +constexpr const E* data(const std::initializer_list<E>& il) CMT_NOEXCEPT { return il.begin(); } @@ -178,7 +179,7 @@ constexpr auto size(const C& c) -> decltype(c.size()) return c.size(); } template <typename T, std::size_t N> -constexpr std::size_t size(const T (&array)[N]) noexcept +constexpr std::size_t size(const T (&)[N]) CMT_NOEXCEPT { return N; } diff --git a/include/kfr/cometa/cstring.hpp b/include/kfr/cometa/cstring.hpp @@ -24,48 +24,48 @@ struct cstring using value_type = char; using size_type = size_t; - constexpr const value_type* c_str() const noexcept { return value; } - constexpr const value_type* data() const noexcept { return value; } + constexpr const value_type* c_str() const CMT_NOEXCEPT { return value; } + constexpr const value_type* data() const CMT_NOEXCEPT { return value; } const value_type value[N]; - constexpr size_type length() const noexcept { return N - 1; } - constexpr size_type size() const noexcept { return N; } + constexpr size_type length() const CMT_NOEXCEPT { return N - 1; } + constexpr size_type size() 
const CMT_NOEXCEPT { return N; } template <size_t start, size_t count> - constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const noexcept + constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const CMT_NOEXCEPT { - return slice_impl(csizeseq_t<count, start>()); + return slice_impl(csizeseq<count, start>); } template <size_t start> - constexpr cstring<N - start> slice(csize_t<start>) const noexcept + constexpr cstring<N - start> slice(csize_t<start>) const CMT_NOEXCEPT { - return slice_impl(csizeseq_t<N - 1 - start, start>()); + return slice_impl(csizeseq<N - 1 - start, start>); } - constexpr friend bool operator==(const cstring& left, const cstring& right) noexcept + constexpr friend bool operator==(const cstring& left, const cstring& right) CMT_NOEXCEPT { for (size_t i = 0; i < 1; i++) if (left.value[i] != right.value[i]) return false; return true; } - constexpr friend bool operator!=(const cstring& left, const cstring& right) noexcept + constexpr friend bool operator!=(const cstring& left, const cstring& right) CMT_NOEXCEPT { return !(left == right); } template <size_t NN> - constexpr bool operator==(const cstring<NN>& other) const noexcept + constexpr bool operator==(const cstring<NN>&) const CMT_NOEXCEPT { return false; } template <size_t NN> - constexpr bool operator!=(const cstring<NN>& other) const noexcept + constexpr bool operator!=(const cstring<NN>&) const CMT_NOEXCEPT { return true; } - constexpr char operator[](size_t index) const noexcept { return value[index]; } + constexpr char operator[](size_t index) const CMT_NOEXCEPT { return value[index]; } private: template <size_t... indices> @@ -98,9 +98,9 @@ CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring< return concat_str_impl(str1, str2, cvalseq_t<size_t, N1 - 1 + N2 - 1>()); } template <size_t N1, size_t Nfrom, size_t Nto, size_t... 
indices> -CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, - const cstring<Nfrom>&, const cstring<Nto>& to, - csizes_t<indices...>) +CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, + const cstring<Nfrom>&, const cstring<Nto>& to, + csizes_t<indices...>) { if (pos == size_t(-1)) stop_constexpr(); @@ -111,35 +111,35 @@ CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring< } } // namespace details -CMT_INTRIN constexpr cstring<1> concat_cstring() { return { { 0 } }; } +CMT_INTRINSIC constexpr cstring<1> concat_cstring() { return { { 0 } }; } template <size_t N1> -CMT_INTRIN constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) +CMT_INTRINSIC constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) { return str1; } template <size_t N1, size_t N2, typename... Args> -CMT_INTRIN constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, - const Args&... args) +CMT_INTRINSIC constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, + const Args&... args) { return details::concat_str_impl(str1, concat_cstring(str2, args...)); } template <size_t N> -CMT_INTRIN constexpr cstring<N> make_cstring(const char (&str)[N]) +CMT_INTRINSIC constexpr cstring<N> make_cstring(const char (&str)[N]) { return details::make_cstring_impl(str, cvalseq_t<size_t, N - 1>()); } template <char... 
chars> -CMT_INTRIN constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) +CMT_INTRINSIC constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) { return { { chars..., 0 } }; } template <size_t N1, size_t Nneedle> -CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle) +CMT_INTRINSIC size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle) { size_t count = 0; for (size_t i = 0; i < N1; i++) @@ -155,8 +155,8 @@ CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needl } template <size_t N1, size_t Nfrom, size_t Nto> -CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from, - const cstring<Nto>& to) +CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from, + const cstring<Nto>& to) { return details::str_replace_impl(str_find(str, from), str, from, to, cvalseq_t<size_t, N1 - Nfrom + Nto - 1>()); diff --git a/include/kfr/cometa/ctti.hpp b/include/kfr/cometa/ctti.hpp @@ -12,7 +12,7 @@ using pconstvoid = const void*; struct type_id_t { - constexpr type_id_t(const void* id) noexcept : id(id) {} + constexpr type_id_t(const void* id) CMT_NOEXCEPT : id(id) {} constexpr bool operator==(type_id_t other) const { return id == other.id; } constexpr bool operator!=(type_id_t other) const { return !(id == other.id); } const void* const id; @@ -22,7 +22,7 @@ namespace details { template <typename T> -constexpr inline type_id_t typeident_impl() noexcept +constexpr inline type_id_t typeident_impl() CMT_NOEXCEPT { return type_id_t(pconstvoid(&typeident_impl<T>)); } @@ -30,21 +30,32 @@ constexpr inline type_id_t typeident_impl() noexcept #ifdef CMT_COMPILER_CLANG constexpr size_t typename_prefix = sizeof("auto cometa::ctype_name() [T = ") - 1; constexpr size_t typename_postfix = sizeof("]") - 1; +#elif CMT_COMPILER_MSVC +constexpr size_t typename_prefix = sizeof("auto __cdecl cometa::ctype_name<") 
- 1; +constexpr size_t typename_postfix = sizeof(">(void) noexcept") - 1; #else constexpr size_t typename_prefix = sizeof("constexpr auto cometa::ctype_name() [with T = ") - 1; constexpr size_t typename_postfix = sizeof("]") - 1; #endif template <size_t... indices, size_t Nout = 1 + sizeof...(indices)> -constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) noexcept +constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) CMT_NOEXCEPT { return cstring<Nout>{ { (str[indices])..., 0 } }; } } // namespace details +#ifdef CMT_COMPILER_MSVC +#define KFR_CALL_CONV_SPEC __cdecl +#else +#define KFR_CALL_CONV_SPEC +#endif + template <typename T> -constexpr auto ctype_name() noexcept +constexpr auto KFR_CALL_CONV_SPEC ctype_name() CMT_NOEXCEPT { + static_assert(details::typename_prefix + details::typename_postfix + 1 <= sizeof(CMT_FUNC_SIGNATURE) - 1, + "Incorrect details::typename_prefix or details::typename_postfix"); return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix, csizeseq_t<(sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix - details::typename_postfix)>()); @@ -57,7 +68,7 @@ constexpr auto ctype_name() noexcept * @return name of the type */ template <typename T> -inline const char* type_name() noexcept +inline const char* type_name() CMT_NOEXCEPT { static const auto name = ctype_name<T>(); return name.c_str(); @@ -70,7 +81,7 @@ inline const char* type_name() noexcept * @return name of the type */ template <typename T> -inline const char* type_name(T x) noexcept +inline const char* type_name(T x) CMT_NOEXCEPT { (void)x; return type_name<T>(); diff --git a/include/kfr/cometa/function.hpp b/include/kfr/cometa/function.hpp @@ -16,20 +16,20 @@ struct virtual_function { virtual Result operator()(Args... 
args) = 0; virtual virtual_function* make_copy() const = 0; - CMT_INTRIN virtual ~virtual_function() = default; + virtual ~virtual_function() = default; }; template <typename Fn, typename Result, typename... Args> struct virtual_function_impl : virtual_function<Result, Args...> { public: - CMT_INTRIN virtual_function_impl(const Fn& fn) : fn(fn) {} - CMT_INTRIN Result operator()(Args... args) override final { return fn(args...); } - CMT_INTRIN virtual_function<Result, Args...>* make_copy() const override final + CMT_MEM_INTRINSIC virtual_function_impl(const Fn& fn) : fn(fn) {} + CMT_MEM_INTRINSIC Result operator()(Args... args) final { return fn(args...); } + CMT_MEM_INTRINSIC virtual_function<Result, Args...>* make_copy() const final { return new virtual_function_impl{ fn }; } - CMT_INTRIN ~virtual_function_impl() {} + CMT_MEM_INTRINSIC ~virtual_function_impl() {} private: Fn fn; @@ -47,13 +47,13 @@ struct func_filter<Result(Args...)> }; template <typename T> -constexpr CMT_INTRIN T return_val() noexcept +constexpr CMT_INTRINSIC T return_val() CMT_NOEXCEPT { return {}; } template <> -constexpr CMT_INTRIN void return_val<void>() noexcept +constexpr CMT_INTRINSIC void return_val<void>() CMT_NOEXCEPT { } } // namespace details @@ -81,16 +81,16 @@ struct function<Result(Args...)> return *this; } - CMT_INTRIN function() : fn(nullptr) {} - CMT_INTRIN function(std::nullptr_t) : fn(nullptr) {} + CMT_MEM_INTRINSIC function() : fn(nullptr) {} + CMT_MEM_INTRINSIC function(std::nullptr_t) : fn(nullptr) {} template <typename Func> - CMT_INTRIN function(const Func& x) + CMT_MEM_INTRINSIC function(const Func& x) : fn(new details::virtual_function_impl<typename details::func_filter<Func>::type, Result, Args...>( x)) { } function(const this_t& other) : fn(other.fn ? 
other.fn->make_copy() : nullptr) {} - CMT_INTRIN function& operator=(const this_t& other) + CMT_MEM_INTRINSIC function& operator=(const this_t& other) { if ((&other != this) && (other.fn)) { @@ -100,14 +100,14 @@ struct function<Result(Args...)> } return *this; } - CMT_INTRIN function& operator=(std::nullptr_t) + CMT_MEM_INTRINSIC function& operator=(std::nullptr_t) { delete fn; fn = nullptr; return *this; } template <typename Fn> - CMT_INTRIN function& operator=(const Fn& x) + CMT_MEM_INTRINSIC function& operator=(const Fn& x) { using FnImpl = details::virtual_function_impl<typename details::func_filter<Fn>::type, Result, Args...>; @@ -116,15 +116,15 @@ struct function<Result(Args...)> fn = temp; return *this; } - CMT_INTRIN Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); } + CMT_MEM_INTRINSIC Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); } template <typename TResult> - CMT_INTRIN Result call(TResult&& default_result, Args... args) const + CMT_MEM_INTRINSIC Result call(TResult&& default_result, Args... args) const { return fn ? (*fn)(std::forward<Args>(args)...) 
: std::forward<TResult>(default_result); } - CMT_INTRIN explicit operator bool() const noexcept { return !!fn; } + CMT_MEM_INTRINSIC explicit operator bool() const CMT_NOEXCEPT { return !!fn; } - CMT_INTRIN ~function() { delete fn; } + CMT_MEM_INTRINSIC ~function() { delete fn; } private: details::virtual_function<Result, Args...>* fn; diff --git a/include/kfr/cometa/named_arg.hpp b/include/kfr/cometa/named_arg.hpp @@ -19,10 +19,10 @@ struct named_arg struct named { - constexpr named(const char* name) noexcept : name(name) {} + constexpr named(const char* name) CMT_NOEXCEPT : name(name) {} template <typename T> - CMT_INTRIN constexpr named_arg<T> operator=(T&& value) + CMT_MEM_INTRINSIC constexpr named_arg<T> operator=(T&& value) { return named_arg<T>{ std::forward<T>(value), name }; } diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp @@ -0,0 +1,194 @@ +/** @addtogroup cometa + * @{ + */ +#pragma once + +#include "../cometa.hpp" + +namespace cometa +{ + +/// @brief Short names for common types +using b8 = bool; +using f32 = float; +using f64 = double; +using i8 = int8_t; +using i16 = int16_t; +using i32 = int32_t; +using i64 = int64_t; +using u8 = uint8_t; +using u16 = uint16_t; +using u32 = uint32_t; +using u64 = uint64_t; +using umax = uint64_t; +using imax = int64_t; +using fmax = double; +using f80 = long double; + +#if defined(CMT_BASETYPE_F32) || defined(CMT_NO_NATIVE_F64) +using fbase = float; +#else +using fbase = double; +#endif + +namespace details +{ +template <typename T> +struct fix_type_impl +{ + using type = T; +}; + +template <> +struct fix_type_impl<char> +{ + using type = i8; +}; + +template <> +struct fix_type_impl<unsigned long> +{ +#if ULONG_MAX == ULLONG_MAX + using type = u64; +#else + using type = u32; +#endif +}; + +template <> +struct fix_type_impl<signed long> +{ +#if LONG_MAX == LLONG_MAX + using type = i64; +#else + using type = i32; +#endif +}; + +template <> +struct fix_type_impl<unsigned long long> +{ + 
using type = u64; +}; + +template <> +struct fix_type_impl<signed long long> +{ + using type = i64; +}; + +} // namespace details + +template <typename T> +using fix_type = typename details::fix_type_impl<T>::type; + +/// @brief An enumeration representing data type +enum class datatype : int +{ + typebits_mask = 0xFF, + f = 0x100, // floating point + i = 0x200, // signed integer + u = 0x300, // unsigned integer + c = 0x400, // complex floating point + b = 0x500, // boolean + typeclass_mask = 0xF00, + f16 = static_cast<int>(f) | 16, + f32 = static_cast<int>(f) | 32, + f64 = static_cast<int>(f) | 64, + f80 = static_cast<int>(f) | 80, + i8 = static_cast<int>(i) | 8, + i16 = static_cast<int>(i) | 16, + i24 = static_cast<int>(i) | 24, + i32 = static_cast<int>(i) | 32, + i64 = static_cast<int>(i) | 64, + u8 = static_cast<int>(u) | 8, + u16 = static_cast<int>(u) | 16, + u24 = static_cast<int>(u) | 24, + u32 = static_cast<int>(u) | 32, + u64 = static_cast<int>(u) | 64, + c32 = static_cast<int>(c) | 32, + c64 = static_cast<int>(c) | 64, + b8 = static_cast<int>(b) | 8 +}; + +constexpr inline datatype operator|(datatype x, datatype y) +{ + using type = underlying_type<datatype>; + return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y)); +} + +constexpr inline datatype operator&(datatype x, datatype y) +{ + using type = underlying_type<datatype>; + return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y)); +} + +template <typename T> +constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value + ? datatype::f + : std::is_integral<typename compound_type_traits<T>::subtype>::value + ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value + ? 
datatype::u + : datatype::i) + : datatype(); + +template <typename T> +using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>; +template <typename T> +using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>; +template <typename T> +using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>; + +template <typename T> +struct typebits +{ + static_assert(is_number<deep_subtype<T>>::value, ""); + constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8; + constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width; + using subtype = typename compound_type_traits<T>::subtype; +}; + +template <typename T> +using ftype = + typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>; +template <typename T> +using itype = + typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>; +template <typename T> +using utype = + typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>; + +template <typename T> +using uitype = conditional<is_i_class<deep_subtype<T>>::value, T, utype<T>>; + +template <typename T> +using fsubtype = ftype<subtype<T>>; +template <typename T> +using isubtype = itype<subtype<T>>; +template <typename T> +using usubtype = utype<subtype<T>>; +namespace details +{ +template <typename T> +struct flt_type_impl +{ + using type = conditional<sizeof(T) <= 2, float, fbase>; +}; + +template <> +struct flt_type_impl<float> +{ + using type = float; +}; +template <> +struct flt_type_impl<double> +{ + using type = double; +}; +} // namespace details + +template <typename T> +using flt_type = typename cometa::compound_type_traits<T>::template deep_rebind< + typename details::flt_type_impl<deep_subtype<T>>::type>; + +} // namespace cometa diff --git a/include/kfr/cometa/range.hpp b/include/kfr/cometa/range.hpp @@ -19,8 
+19,9 @@ struct range using const_pointer = const T*; using diff_type = decltype(std::declval<T>() - std::declval<T>()); - constexpr range(value_type begin, value_type end, diff_type step) noexcept - : value_begin(begin), value_end(end), step(step) + constexpr range(value_type begin, value_type end, diff_type step) CMT_NOEXCEPT : min(begin), + max(end), + step(step) { } @@ -28,42 +29,44 @@ struct range { value_type value; diff_type step; - const_reference operator*() const { return value; } - const_pointer operator->() const { return &value; } - iterator& operator++() + constexpr const_reference operator*() const { return value; } + constexpr const_pointer operator->() const { return &value; } + constexpr iterator& operator++() { value += step; return *this; } - iterator operator++(int) + constexpr iterator operator++(int) { iterator copy = *this; ++(*this); return copy; } - bool operator!=(const iterator& other) const + constexpr bool operator!=(const iterator& other) const { return step > 0 ? value < other.value : value > other.value; } }; - value_type value_begin; - value_type value_end; + value_type min; + value_type max; diff_type step; - iterator begin() const { return iterator{ value_begin, step }; } - iterator end() const { return iterator{ value_end, step }; } + constexpr iterator begin() const { return iterator{ min, step }; } + constexpr iterator end() const { return iterator{ max, step }; } + + constexpr T distance() const { return max - min; } }; /// @brief Make iterable range object template <typename T> -range<T> make_range(T begin, T end) +constexpr range<T> make_range(T begin, T end) { return range<T>(begin, end, end > begin ? 
1 : -1); } /// @brief Make iterable range object with step -template <typename T, typename diff_type = decltype(std::declval<T>() - std::declval<T>())> -range<T> make_range(T begin, T end, diff_type step) +template <typename T, typename D> +constexpr range<std::common_type_t<T, D>> make_range(T begin, T end, D step) { - return range<T>(begin, end, step); + return range<std::common_type_t<T, D>>(begin, end, step); } } // namespace cometa diff --git a/include/kfr/cometa/result.hpp b/include/kfr/cometa/result.hpp @@ -20,18 +20,19 @@ struct result constexpr static error_type ok_value = OkValue; - constexpr result(const result&) = default; - constexpr result(result&&) noexcept = default; + constexpr result(const result&) = default; + constexpr result(result&&) CMT_NOEXCEPT = default; - constexpr result(ErrEnum error) noexcept : m_error(error) {} + constexpr result(ErrEnum error) CMT_NOEXCEPT : m_error(error) {} template <typename ValueInit, CMT_ENABLE_IF(std::is_constructible<value_type, ValueInit>::value)> - constexpr result(ValueInit&& value) noexcept : m_value(std::forward<ValueInit>(value)), m_error(OkValue) + constexpr result(ValueInit&& value) CMT_NOEXCEPT : m_value(std::forward<ValueInit>(value)), + m_error(OkValue) { } - constexpr result(const Type& value) noexcept : m_value(value), m_error(OkValue) {} - constexpr result(Type&& value) noexcept : m_value(std::move(value)), m_error(OkValue) {} + constexpr result(const Type& value) CMT_NOEXCEPT : m_value(value), m_error(OkValue) {} + constexpr result(Type&& value) CMT_NOEXCEPT : m_value(std::move(value)), m_error(OkValue) {} constexpr explicit operator bool() const { return m_error == OkValue; } constexpr const_reference operator*() const { return m_value; } diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp @@ -27,7 +27,7 @@ template <typename T> struct representation { using type = T; - static constexpr const T& get(const T& value) noexcept { return value; } + static constexpr const T& 
get(const T& value) CMT_NOEXCEPT { return value; } }; template <typename T> @@ -175,7 +175,7 @@ CMT_INLINE auto pack_value(const fmt_t<T, t, width, prec>& value) } template <typename T> -CMT_INLINE auto pack_value(const T& value) +CMT_INLINE auto pack_value(const T&) { return pack_value(type_name<T>()); } @@ -218,7 +218,7 @@ CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& template <size_t N1, size_t Nto> CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt) { - return fmt_replace_impl(str, newfmt, csizeseq_t<N1 - 3 + Nto - 1>()); + return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>); } inline std::string replace_one(const std::string& str, const std::string& from, const std::string& to) @@ -305,7 +305,7 @@ struct print_t } }; -#ifdef CMT_COMPILER_GNU +#if defined CMT_COMPILER_GNU && !defined(CMT_COMPILER_INTEL) template <typename Char, Char... chars> constexpr format_t<chars...> operator""_format() diff --git a/include/kfr/cpuid.hpp b/include/kfr/cpuid.hpp @@ -1,26 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. 
- See https://www.kfrlib.com for details. - */ -#pragma once - -#include "cpuid/cpuid.hpp" -#include "cpuid/cpuid_auto.hpp" diff --git a/include/kfr/cpuid/cpuid.hpp b/include/kfr/cpuid/cpuid.hpp @@ -1,297 +0,0 @@ -/** @addtogroup cpuid - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#ifdef _MSC_VER -#include <intrin.h> -#endif - -#include "../base/platform.hpp" -#include "../base/types.hpp" -#include <cstring> - -namespace kfr -{ -#ifdef CMT_ARCH_X86 - -struct cpu_features -{ - u32 max; - u32 exmax; - u32 isIntel : 1; - u32 isAMD : 1; - u32 has3DNOW : 1; - u32 has3DNOWEXT : 1; - u32 hasABM : 1; - u32 hasADX : 1; - u32 hasAES : 1; - u32 hasAVX : 1; - u32 hasAVX2 : 1; - u32 hasAVXOSSUPPORT : 1; - u32 hasAVX512OSSUPPORT : 1; - u32 hasAVX512CD : 1; - u32 hasAVX512ER : 1; - u32 hasAVX512F : 1; - u32 hasAVX512DQ : 1; - u32 hasAVX512PF : 1; - u32 hasAVX512BW : 1; - u32 hasAVX512VL : 1; - u32 hasBMI1 : 1; - u32 hasBMI2 : 1; - u32 hasCLFSH : 1; - u32 hasCMOV : 1; - u32 hasCMPXCHG16B : 1; - u32 hasCX8 : 1; - u32 hasERMS : 1; - u32 hasF16C : 1; - u32 hasFMA : 1; - u32 hasFSGSBASE : 1; - u32 hasFXSR : 1; - u32 hasHLE : 1; - u32 hasINVPCID : 1; - u32 hasLAHF : 1; - u32 hasLZCNT : 1; - u32 hasMMX : 1; - u32 hasMMXEXT : 1; - u32 hasMONITOR : 1; - u32 hasMOVBE : 1; - u32 hasMSR : 1; - u32 hasOSXSAVE : 1; - u32 hasPCLMULQDQ : 1; - u32 hasPOPCNT : 1; - u32 hasPREFETCHWT1 : 1; - u32 hasRDRAND : 1; - u32 hasRDSEED : 1; - u32 hasRDTSCP : 1; - u32 hasRTM : 1; - u32 hasSEP : 1; - u32 hasSHA : 1; - u32 hasSSE : 1; - u32 hasSSE2 : 1; - u32 hasSSE3 : 1; - u32 hasSSE41 : 1; - u32 hasSSE42 : 1; - u32 hasSSE4a : 1; - u32 hasSSSE3 : 1; - u32 hasSYSCALL : 1; - u32 hasTBM : 1; - u32 hasXOP : 1; - u32 hasXSAVE : 1; - u32 padding1 : 6; - char vendor[17]; - char model[49]; - char padding2[2]; -}; - -namespace internal -{ - -struct cpu_data -{ - u32 data[4]; -}; - -#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG -CMT_INLINE u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx) -{ - __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc)); - return 1; -} -CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) -{ - get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]); -} 
-CMT_INLINE u32 get_xcr0() -{ - u32 xcr0; - __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); - return xcr0; -} -#elif defined CMT_COMPILER_MSVC - -CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { __cpuidex((int*)ptr, (int)func, (int)subfunc); } -CMT_INLINE u32 get_xcr0() -{ -#ifdef _XCR_XFEATURE_ENABLED_MASK - unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - return (u32)Result; -#else - return 0; -#endif -} -#endif - -template <size_t = 0> -cpu_t detect_cpu() -{ - cpu_features c; - memset(&c, 0, sizeof(c)); - cpu_data data0; - cpu_data exdata0; - - u32 f_1_ECX(0); - u32 f_1_EDX(0); - u32 f_7_EBX(0); - u32 f_7_ECX(0); - u32 f_81_ECX(0); - u32 f_81_EDX(0); - - cpuid(data0.data, 0); - c.max = static_cast<u32>(data0.data[0]); - cpuid(exdata0.data, 0x80000000); - c.exmax = static_cast<u32>(exdata0.data[0]); - - *ptr_cast<u32>(c.vendor) = static_cast<u32>(data0.data[1]); - *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]); - *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]); - - c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0; - c.isAMD = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 
1 : 0; - - if (c.max >= 1) - { - cpu_data data1; - cpuid(data1.data, 1); - f_1_ECX = static_cast<u32>(data1.data[2]); - f_1_EDX = static_cast<u32>(data1.data[3]); - } - - if (c.max >= 7) - { - cpu_data data7; - cpuid(data7.data, 7); - f_7_EBX = static_cast<u32>(data7.data[1]); - f_7_ECX = static_cast<u32>(data7.data[2]); - } - - if (c.exmax >= 0x80000001) - { - cpu_data data81; - cpuid(data81.data, 0x80000001); - f_81_ECX = static_cast<u32>(data81.data[2]); - f_81_EDX = static_cast<u32>(data81.data[3]); - } - - if (c.exmax >= 0x80000004) - { - cpu_data data82; - cpu_data data83; - cpu_data data84; - cpuid(data82.data, 0x80000002); - cpuid(data83.data, 0x80000003); - cpuid(data84.data, 0x80000004); - memcpy(c.model, data82.data, sizeof(cpu_data)); - memcpy(c.model + 16, data83.data, sizeof(cpu_data)); - memcpy(c.model + 32, data84.data, sizeof(cpu_data)); - } - - c.hasSSE3 = f_1_ECX >> 0 & 1; - c.hasPCLMULQDQ = f_1_ECX >> 1 & 1; - c.hasMONITOR = f_1_ECX >> 3 & 1; - c.hasSSSE3 = f_1_ECX >> 9 & 1; - c.hasFMA = f_1_ECX >> 12 & 1; - c.hasCMPXCHG16B = f_1_ECX >> 13 & 1; - c.hasSSE41 = f_1_ECX >> 19 & 1; - c.hasSSE42 = f_1_ECX >> 20 & 1; - c.hasMOVBE = f_1_ECX >> 22 & 1; - c.hasPOPCNT = f_1_ECX >> 23 & 1; - c.hasAES = f_1_ECX >> 25 & 1; - c.hasXSAVE = f_1_ECX >> 26 & 1; - c.hasOSXSAVE = f_1_ECX >> 27 & 1; - c.hasAVX = f_1_ECX >> 28 & 1; - c.hasF16C = f_1_ECX >> 29 & 1; - c.hasRDRAND = f_1_ECX >> 30 & 1; - c.hasMSR = f_1_EDX >> 5 & 1; - c.hasCX8 = f_1_EDX >> 8 & 1; - c.hasSEP = f_1_EDX >> 11 & 1; - c.hasCMOV = f_1_EDX >> 15 & 1; - c.hasCLFSH = f_1_EDX >> 19 & 1; - c.hasMMX = f_1_EDX >> 23 & 1; - c.hasFXSR = f_1_EDX >> 24 & 1; - c.hasSSE = f_1_EDX >> 25 & 1; - c.hasSSE2 = f_1_EDX >> 26 & 1; - c.hasFSGSBASE = f_7_EBX >> 0 & 1; - c.hasBMI1 = f_7_EBX >> 3 & 1; - c.hasHLE = c.isIntel && f_7_EBX >> 4 & 1; - c.hasAVX2 = f_7_EBX >> 5 & 1; - c.hasBMI2 = f_7_EBX >> 8 & 1; - c.hasERMS = f_7_EBX >> 9 & 1; - c.hasINVPCID = f_7_EBX >> 10 & 1; - c.hasRTM = c.isIntel && f_7_EBX >> 11 & 1; 
- c.hasAVX512F = f_7_EBX >> 16 & 1; - c.hasAVX512DQ = f_7_EBX >> 17 & 1; - c.hasRDSEED = f_7_EBX >> 18 & 1; - c.hasADX = f_7_EBX >> 19 & 1; - c.hasAVX512PF = f_7_EBX >> 26 & 1; - c.hasAVX512ER = f_7_EBX >> 27 & 1; - c.hasAVX512CD = f_7_EBX >> 28 & 1; - c.hasSHA = f_7_EBX >> 29 & 1; - c.hasAVX512BW = f_7_EBX >> 30 & 1; - c.hasAVX512VL = f_7_EBX >> 31 & 1; - c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1; - c.hasLAHF = f_81_ECX >> 0 & 1; - c.hasLZCNT = c.isIntel && f_81_ECX >> 5 & 1; - c.hasABM = c.isAMD && f_81_ECX >> 5 & 1; - c.hasSSE4a = c.isAMD && f_81_ECX >> 6 & 1; - c.hasXOP = c.isAMD && f_81_ECX >> 11 & 1; - c.hasTBM = c.isAMD && f_81_ECX >> 21 & 1; - c.hasSYSCALL = c.isIntel && f_81_EDX >> 11 & 1; - c.hasMMXEXT = c.isAMD && f_81_EDX >> 22 & 1; - c.hasRDTSCP = c.isIntel && f_81_EDX >> 27 & 1; - c.has3DNOWEXT = c.isAMD && f_81_EDX >> 30 & 1; - c.has3DNOW = c.isAMD && f_81_EDX >> 31 & 1; - - c.hasAVXOSSUPPORT = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06; - c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0; - - if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ && - c.hasAVX512OSSUPPORT) - return cpu_t::avx512; - if (c.hasAVX2 && c.hasAVXOSSUPPORT) - return cpu_t::avx2; - if (c.hasAVX && c.hasAVXOSSUPPORT) - return cpu_t::avx1; - if (c.hasSSE41) - return cpu_t::sse41; - if (c.hasSSSE3) - return cpu_t::ssse3; - if (c.hasSSE3) - return cpu_t::sse3; - if (c.hasSSE2) - return cpu_t::sse2; - return cpu_t::lowest; -} -} // namespace internal -#else - -template <size_t = 0> -cpu_t detect_cpu() -{ - return cpu_t::native; -} - -#endif -} // namespace kfr diff --git a/include/kfr/cpuid/cpuid_auto.hpp b/include/kfr/cpuid/cpuid_auto.hpp @@ -1,60 +0,0 @@ -/** @addtogroup cpuid - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public 
License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "cpuid.hpp" - -namespace kfr -{ -namespace internal -{ - -CMT_INLINE cpu_t& cpu_v() -{ - static cpu_t v1 = cpu_t::native; - return v1; -} - -CMT_INLINE char init_cpu_v() -{ - cpu_v() = detect_cpu<0>(); - return 0; -} - -CMT_INLINE char init_dummyvar() -{ - static char dummy = init_cpu_v(); - return dummy; -} - -static char dummyvar = init_dummyvar(); -} // namespace internal - -/** - * @brief Returns cpu instruction set detected at runtime. - */ -CMT_INLINE cpu_t get_cpu() { return internal::cpu_v(); } -} // namespace kfr diff --git a/include/kfr/data/sincos.hpp b/include/kfr/data/sincos.hpp @@ -1,192 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../base/kfr.h" -#include "../base/types.hpp" -#include <cstdint> - -namespace kfr -{ - -namespace data -{ - -template <typename T> -constexpr T c_sin_table[65] = { - /* sin(2*pi* 0/ 256) */ f32(0.0), - /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), - /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ 
f32(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ f32(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ 
f32(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ f32(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) -}; - -// data generated by mpfr -template <> -constexpr f64 c_sin_table<f64>[65] = { - /* sin(2*pi* 0/ 256) */ f64(0.0), - /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), - /* 
sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ 
f64(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ f64(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ 
f64(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ f64(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) -}; - -} // namespace data - -template <typename T> -constexpr inline T sin_using_table_256(size_t k) -{ - return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; -} - -template <typename T> -constexpr inline T sin_using_table(size_t size, size_t k) -{ - return sin_using_table_256<T>((k * 256 / size) % 256); -} -template <typename T> -constexpr inline T cos_using_table(size_t size, size_t k) -{ - return sin_using_table<T>(size, k + size / 4); -} -} // namespace kfr diff --git a/include/kfr/dft/cache.hpp b/include/kfr/dft/cache.hpp @@ -32,6 +32,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> using dft_plan_ptr = std::shared_ptr<const dft_plan<T>>; @@ -166,4 +168,5 @@ univector<T> irealdft(const univector<complex<T>, Tag>& input) dft->execute(output, input, temp); return output; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dft +/** @addtogroup convolution * @{ */ /* @@ -25,12 +25,12 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/filter.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" #include "cache.hpp" #include "fft.hpp" @@ -42,8 +42,10 @@ CMT_PRAGMA_GNU(GCC 
diagnostic ignored "-Wshadow") namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { template <typename T> univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2); @@ -51,27 +53,27 @@ template <typename T> univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2); template <typename T> univector<T> autocorrelate(const univector_ref<const T>& src1); -} // namespace internal +} // namespace intrinsics /// @brief Convolution template <typename T, univector_tag Tag1, univector_tag Tag2> univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - return internal::convolve(src1.slice(), src2.slice()); + return intrinsics::convolve(src1.slice(), src2.slice()); } /// @brief Correlation template <typename T, univector_tag Tag1, univector_tag Tag2> univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - return internal::correlate(src1.slice(), src2.slice()); + return intrinsics::correlate(src1.slice(), src2.slice()); } /// @brief Auto-correlation template <typename T, univector_tag Tag1> univector<T> autocorrelate(const univector<T, Tag1>& src) { - return internal::autocorrelate(src.slice()); + return intrinsics::autocorrelate(src.slice()); } /// @brief Convolution using Filter API @@ -91,12 +93,12 @@ protected: } void process_buffer(T* output, const T* input, size_t size) final; + const size_t size; + const size_t block_size; const dft_plan_real<T> fft; univector<u8> temp; std::vector<univector<complex<T>>> segments; std::vector<univector<complex<T>>> ir_segments; - const size_t size; - const size_t block_size; size_t input_position; univector<T> saved_input; univector<complex<T>> premul; @@ -105,6 +107,6 @@ protected: univector<T> overlap; size_t position; }; - +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/data/bitrev.hpp 
b/include/kfr/dft/data/bitrev.hpp diff --git a/include/kfr/dft/data/sincos.hpp b/include/kfr/dft/data/sincos.hpp @@ -0,0 +1,192 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../kfr.h" +#include "../../simd/types.hpp" +#include <cstdint> + +namespace kfr +{ + +namespace data +{ + +template <typename T> +constexpr T c_sin_table[65] = { + /* sin(2*pi* 0/ 256) */ f32(0.0), + /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), + /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ 
f32(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ 
f32(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) +}; + +// data generated by mpfr +template <> +constexpr f64 c_sin_table<f64>[65] = { + /* sin(2*pi* 0/ 256) */ f64(0.0), + /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), + /* 
sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ 
f64(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ 
f64(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) +}; + +} // namespace data + +template <typename T> +constexpr inline T sin_using_table_256(size_t k) +{ + return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; +} + +template <typename T> +constexpr inline T sin_using_table(size_t size, size_t k) +{ + return sin_using_table_256<T>((k * 256 / size) % 256); +} +template <typename T> +constexpr inline T cos_using_table(size_t size, size_t k) +{ + return sin_using_table<T>(size, k + size / 4); +} +} // namespace kfr diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -25,13 +25,13 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" #include "../base/small_buffer.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") @@ -57,9 +57,12 @@ enum class dft_type enum class dft_order { normal, - internal, // possibly bit/digit-reversed, implementation-defined, faster + internal, // possibly bit/digit-reversed, implementation-defined, faster to compute }; +inline namespace CMT_ARCH_NAME +{ + template <typename T> struct dft_stage; @@ -76,7 +79,8 @@ struct dft_plan void dump() const; - KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, + bool inverse = false) const { if (inverse) execute_dft(ctrue, out, in, temp); @@ -85,14 +89,15 @@ struct dft_plan } ~dft_plan(); template <bool inverse> - KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, cbool_t<inverse> inv) const + KFR_MEM_INTRINSIC void 
execute(complex<T>* out, const complex<T>* in, u8* temp, + cbool_t<inverse> inv) const { execute_dft(inv, out, in, temp); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, bool inverse = false) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, bool inverse = false) const { if (inverse) execute_dft(ctrue, out.data(), in.data(), temp.data()); @@ -100,8 +105,8 @@ struct dft_plan execute_dft(cfalse, out.data(), in.data(), temp.data()); } template <bool inverse, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, cbool_t<inverse> inv) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, cbool_t<inverse> inv) const { execute_dft(inv, out.data(), in.data(), temp.data()); } @@ -128,6 +133,9 @@ protected: const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, const complex<T>* scratch, bool in_scratch) const; complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const; + + void init_dft(size_t size, dft_order order); + void init_fft(size_t size, dft_order order); }; enum class dft_pack_format @@ -155,14 +163,14 @@ struct dft_plan_real : dft_plan<T> void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, cbool_t<inverse>) const = delete; - KFR_INTRIN void execute(complex<T>* out, const T* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, + dft_pack_format fmt = dft_pack_format::CCs) const { this->execute_dft(cfalse, out, 
ptr_cast<complex<T>>(in), temp); to_fmt(out, fmt); } - KFR_INTRIN void execute(T* out, const complex<T>* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, + dft_pack_format fmt = dft_pack_format::CCs) const { complex<T>* outdata = ptr_cast<complex<T>>(out); from_fmt(outdata, in, fmt); @@ -170,15 +178,17 @@ struct dft_plan_real : dft_plan<T> } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, - univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, + univector<u8, Tag3>& temp, + dft_pack_format fmt = dft_pack_format::CCs) const { this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data()); to_fmt(out.data(), fmt); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, + dft_pack_format fmt = dft_pack_format::CCs) const { complex<T>* outdata = ptr_cast<complex<T>>(out.data()); from_fmt(outdata, in.data(), fmt); @@ -230,6 +240,7 @@ void fft_multiply_accumulate(univector<complex<T>, Tag1>& dest, const univector< if (fmt == dft_pack_format::Perm) dest[0] = f0; } +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp @@ -25,19 +25,21 @@ */ #pragma once -#include "../../base/complex.hpp" -#include "../../base/constants.hpp" -#include "../../base/digitreverse.hpp" -#include "../../base/vec.hpp" +#include 
"../../simd/complex.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/digitreverse.hpp" +#include "../../simd/vec.hpp" -#include "../../data/bitrev.hpp" +#include "../data/bitrev.hpp" #include "ft.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { constexpr bool fft_reorder_aligned = false; @@ -74,7 +76,7 @@ CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(T* inout, size_t i) +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i) { using cxx = cvec<T, 16>; constexpr size_t N = 1 << log2n; @@ -86,7 +88,7 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j) { CMT_ASSUME(i != j); using cxx = cvec<T, 16>; @@ -103,7 +105,7 @@ KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j) { CMT_ASSUME(i != j); using cxx = cvec<T, 16>; @@ -120,25 +122,25 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(complex<T>* inout, size_t i) +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i) { fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2); } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) +KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) { fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2); } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void 
fft_reorder_swap(complex<T>* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j) { fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2); } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>) { fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4); fft_reorder_swap<11>(inout, 1 * 4, 64 * 4); @@ -207,7 +209,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>) { constexpr size_t bitrev = 2; fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4); @@ -217,7 +219,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>) { constexpr size_t bitrev = 4; fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4); @@ -231,7 +233,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>) { constexpr size_t bitrev = 2; fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4); @@ -253,14 +255,14 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>) } template <typename T, bool use_br2> -void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) +KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) { cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, digitreverse<(use_br2 ? 
2 : 4), 2>(value)); } template <typename T, bool use_br2> -KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) +KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) { CMT_ASSUME(i != j); const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); @@ -270,7 +272,7 @@ KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbo } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) { const size_t N = 1 << log2n; const size_t N4 = N / 4; @@ -305,7 +307,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) { const size_t N = size_t(1) << log2n; const size_t N4 = N / 4; @@ -386,5 +388,6 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) i += istep; } } -} // namespace internal +} // namespace intrinsics +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -27,8 +27,10 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { template <typename T> @@ -76,18 +78,19 @@ univector<T> autocorrelate(const univector_ref<const T>& src1) return result; } -} // namespace internal +} // namespace intrinsics template <typename T> convolve_filter<T>::convolve_filter(size_t size, size_t block_size) - : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size), + : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), segments((size + block_size - 1) / 
block_size) + { } template <typename T> convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size) - : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)), + : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), @@ -124,8 +127,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) while (processed < size) { const size_t processing = std::min(size - processed, block_size - input_position); - internal::builtin_memcpy(saved_input.data() + input_position, input + processed, - processing * sizeof(T)); + builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); process(scratch, padded(saved_input)); fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); @@ -152,7 +154,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) input_position = 0; process(saved_input, zeros()); - internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); + builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); position = position > 0 ? 
position - 1 : segments.size() - 1; } @@ -161,7 +163,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) } } -namespace internal +namespace intrinsics { template univector<float> convolve<float>(const univector_ref<const float>&, @@ -171,7 +173,7 @@ template univector<float> correlate<float>(const univector_ref<const float>&, template univector<float> autocorrelate<float>(const univector_ref<const float>&); -} // namespace internal +} // namespace intrinsics template convolve_filter<float>::convolve_filter(size_t, size_t); @@ -181,7 +183,7 @@ template void convolve_filter<float>::set_data(const univector<float>&); template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); -namespace internal +namespace intrinsics { template univector<double> convolve<double>(const univector_ref<const double>&, @@ -191,7 +193,7 @@ template univector<double> correlate<double>(const univector_ref<const double>&, template univector<double> autocorrelate<double>(const univector_ref<const double>&); -} // namespace internal +} // namespace intrinsics template convolve_filter<double>::convolve_filter(size_t, size_t); @@ -200,5 +202,5 @@ template convolve_filter<double>::convolve_filter(const univector<double>&, size template void convolve_filter<double>::set_data(const univector<double>&); template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); - +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -0,0 +1,123 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../dft_c.h" + +#include "../../base/basic_expressions.hpp" +#include "../../math/complex_math.hpp" +#include "../../testo/assert.hpp" +#include "../cache.hpp" +#include "../fft.hpp" +#include "bitrev.hpp" +#include "ft.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +#define DFT_ASSERT TESTO_ASSERT_INACTIVE + +template <typename T> +constexpr size_t fft_vector_width = vector_width<T>; + +using cdirect_t = cfalse_t; +using cinvert_t = ctrue_t; + +template <typename T> +struct dft_stage +{ + size_t radix = 0; + size_t stage_size = 0; + size_t data_size = 0; + size_t temp_size = 0; + u8* data = nullptr; + size_t repeats = 1; + size_t out_offset = 0; + size_t blocks = 0; + const char* name = nullptr; + bool recursion = false; + bool can_inplace = true; + bool inplace = false; + bool to_scratch = false; + bool need_reorder = true; + + void initialize(size_t size) { do_initialize(size); } + + virtual void dump() const + { + printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? 
name : "unnamed", radix, + stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, + to_scratch); + } + + KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cdirect_t(), out, in, temp); + } + KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cinvert_t(), out, in, temp); + } + virtual ~dft_stage() {} + +protected: + virtual void do_initialize(size_t) {} + virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; + virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; +}; + +#define DFT_STAGE_FN \ + void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<false>(out, in, temp); \ + } \ + void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<true>(out, in, temp); \ + } + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wassume") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") +#endif + +template <typename T> +template <typename Stage, typename... Args> +void dft_plan<T>::add_stage(Args... args) +{ + dft_stage<T>* stage = new Stage(args...); + stage->need_reorder = need_reorder; + this->data_size += stage->data_size; + this->temp_size += stage->temp_size; + stages.push_back(dft_stage_ptr(stage)); +} + +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp @@ -23,20 +23,17 @@ disclosing the source code of your own applications. See https://www.kfrlib.com for details. 
*/ +#pragma once -#include "../dft_c.h" - -#include "../../base/basic_expressions.hpp" -#include "../../testo/assert.hpp" -#include "../cache.hpp" -#include "../fft.hpp" -#include "bitrev.hpp" -#include "ft.hpp" +#include "dft-fft.hpp" CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") #endif +#if CMT_HAS_WARNING("-Wunused-lambda-capture") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") +#endif CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4100)) @@ -44,439 +41,15 @@ CMT_PRAGMA_MSVC(warning(disable : 4100)) namespace kfr { -constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; - -#define DFT_ASSERT TESTO_ASSERT_INACTIVE - -template <typename T> -constexpr size_t fft_vector_width = platform<T>::vector_width; - -using cdirect_t = cfalse_t; -using cinvert_t = ctrue_t; - -template <typename T> -struct dft_stage -{ - size_t radix = 0; - size_t stage_size = 0; - size_t data_size = 0; - size_t temp_size = 0; - u8* data = nullptr; - size_t repeats = 1; - size_t out_offset = 0; - size_t blocks = 0; - const char* name = nullptr; - bool recursion = false; - bool can_inplace = true; - bool inplace = false; - bool to_scratch = false; - bool need_reorder = true; - - void initialize(size_t size) { do_initialize(size); } - - virtual void dump() const - { - printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? 
name : "unnamed", radix, - stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, - to_scratch); - } - - KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cdirect_t(), out, in, temp); - } - KFR_INTRIN void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cinvert_t(), out, in, temp); - } - virtual ~dft_stage() {} - -protected: - virtual void do_initialize(size_t) {} - virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; - virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; -}; - -#define DFT_STAGE_FN \ - void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<false>(out, in, temp); \ - } \ - void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<true>(out, in, temp); \ - } - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wassume") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") -#endif - -namespace internal -{ - -template <size_t width, bool inverse, typename T> -KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, - const cvec<T, width>& w, const cvec<T, width>& tw) -{ - cvec<T, width> ww = w; - cvec<T, width> tw_ = tw; - cvec<T, width> b1 = ww * dupeven(tw_); - ww = swap<2>(ww); - - if (inverse) - tw_ = -(tw_); - ww = subadd(b1, ww * dupodd(tw_)); - return ww; -} - -template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>, - cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in, - const complex<T>* twiddle) -{ - const size_t N4 = N / 4; - cvec<T, width> w1, w2, w3; - - cvec<T, width> sum02, sum13, diff02, diff13; - - cvec<T, width> a0, a1, a2, a3; - a0 = 
cread<width, aligned>(in + 0); - a2 = cread<width, aligned>(in + N4 * 2); - sum02 = a0 + a2; - - a1 = cread<width, aligned>(in + N4); - a3 = cread<width, aligned>(in + N4 * 3); - sum13 = a1 + a3; - - cwrite<width, aligned>(out, sum02 + sum13); - w2 = sum02 - sum13; - cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), - radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, - cread<width, true>(twiddle + width))); - diff02 = a0 - a2; - diff13 = a1 - a3; - if (inverse) - { - diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); - diff13 = swap<2>(diff13); - } - else - { - diff13 = swap<2>(diff13); - diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); - } - - w1 = diff02 + diff13; - - cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1), - radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, - cread<width, true>(twiddle + 0))); - w3 = diff02 - diff13; - cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), - w3, cread<width, true>(twiddle + width * 2))); -} - -template <size_t width, bool inverse, typename T> -KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, - const cvec<T, width>& w, const cvec<T, width>& tw) -{ - vec<T, width> re1, im1, twre, twim; - split(w, re1, im1); - split(tw, twre, twim); - - const vec<T, width> b1re = re1 * twre; - const vec<T, width> b1im = im1 * twre; - if (inverse) - return concat(b1re + im1 * twim, b1im - re1 * twim); - else - return concat(b1re - im1 * twim, b1im + re1 * twim); -} - -template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, - const complex<T>* in, const complex<T>* twiddle) -{ - const size_t N4 = N / 4; - cvec<T, width> w1, w2, w3; - constexpr bool 
read_split = !splitin && splitout; - constexpr bool write_split = splitin && !splitout; - - vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3; - - split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0); - split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1); - split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2); - split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3); - - const vec<T, width> sum02re = re0 + re2; - const vec<T, width> sum02im = im0 + im2; - const vec<T, width> sum13re = re1 + re3; - const vec<T, width> sum13im = im1 + im3; - - cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); - w2 = concat(sum02re - sum13re, sum02im - sum13im); - cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, - cread<width, true>(twiddle + width))); - - const vec<T, width> diff02re = re0 - re2; - const vec<T, width> diff02im = im0 - im2; - const vec<T, width> diff13re = re1 - re3; - const vec<T, width> diff13im = im1 - im3; - - (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); - (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); - - cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 
2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, - cread<width, true>(twiddle + 0))); - cwrite_split<width, aligned, write_split>( - out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, - cread<width, true>(twiddle + width * 2))); -} - -template <typename T> -CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) -{ - if (n == 0) - { - return make_vector(static_cast<T>(1), static_cast<T>(0)); - } - else if (n == size / 4) - { - return make_vector(static_cast<T>(0), static_cast<T>(-1)); - } - else if (n == size / 2) - { - return make_vector(static_cast<T>(-1), static_cast<T>(0)); - } - else if (n == size * 3 / 4) - { - return make_vector(static_cast<T>(0), static_cast<T>(1)); - } - else - { - fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); - fbase tcos = +kfr::cos(kth); - fbase tsin = -kfr::sin(kth); - return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); - } -} - -template <typename T, size_t width> -KFR_SINTRIN void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size, - bool split_format) -{ - vec<T, 2 * width> result = T(); - CMT_LOOP_UNROLL - for (size_t i = 0; i < width; i++) - { - const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); - result[i * 2] = r[0]; - result[i * 2 + 1] = r[1]; - } - if (split_format) - ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result); - else - ref_cast<cvec<T, width>>(twiddle[0]) = result; - twiddle += width; -} - -template <typename T, size_t width> -CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) -{ - const size_t count = stage_size / 4; - size_t nnstep = size / stage_size; - DFT_ASSERT(width <= count); - CMT_LOOP_NOUNROLL - for (size_t n = 0; n < count; n += width) - { - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, 
nnstep * 2, size, split_format); - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format); - } -} - -#if defined CMT_ARCH_SSE -#ifdef CMT_COMPILER_GNU -#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); -#else -#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); -#endif -#else -#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); -#endif - -template <typename T> -KFR_SINTRIN void prefetch_one(const complex<T>* in) -{ - KFR_PREFETCH(in); -} - -template <typename T> -KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in) -{ - KFR_PREFETCH(in); - KFR_PREFETCH(in + stride); - KFR_PREFETCH(in + stride * 2); - KFR_PREFETCH(in + stride * 3); -} - -template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, - bool inverse, bool aligned, typename T> -KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>* in, const complex<T>*& twiddle) -{ - constexpr static size_t prefetch_offset = width * 8; - const auto N4 = N / csize_t<4>(); - const auto N43 = N4 * csize_t<3>(); - CMT_ASSUME(blocks > 0); - CMT_ASSUME(N > 0); - CMT_ASSUME(N4 > 0); - DFT_ASSERT(width <= N4); - CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) - { - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (size_t n2 = 0; n2 < N4; n2 += width) - { - if (prefetch) - prefetch_four(N4, in + prefetch_offset); - radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), - cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, - in, twiddle + n2 * 3); - in += width; - out += width; - } - in += N43; - out += N43; - } - twiddle += N43; - return {}; -} - -template <bool splitin, size_t width, bool prefetch, bool 
use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +inline namespace CMT_ARCH_NAME { - CMT_ASSUME(blocks > 0); - constexpr static size_t prefetch_offset = 32 * 4; - for (size_t b = 0; b < blocks; b++) - { - if (prefetch) - prefetch_four(csize_t<64>(), out + prefetch_offset); - cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; - split(cread_split<8, aligned, splitin>(out + 0), w0, w1); - split(cread_split<8, aligned, splitin>(out + 8), w2, w3); - split(cread_split<8, aligned, splitin>(out + 16), w4, w5); - split(cread_split<8, aligned, splitin>(out + 24), w6, w7); - - butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); - - w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); - w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); - w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); - w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); - w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); - w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); - w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); - - cvec<T, 8> z0, z1, z2, z3; - transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); - - butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); - cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3))); - out += 32; - } - return {}; -} - -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) -{ - CMT_ASSUME(blocks > 0); - DFT_ASSERT(2 <= blocks); - constexpr static size_t prefetch_offset = width * 16; - 
for (size_t b = 0; b < blocks; b += 2) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 8> vlo = cread<8, aligned>(out + 0); - cvec<T, 8> vhi = cread<8, aligned>(out + 8); - butterfly8<inverse>(vlo); - butterfly8<inverse>(vhi); - vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo); - vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi); - cwrite<8, aligned>(out, vlo); - cwrite<8, aligned>(out + 8, vhi); - out += 16; - } - return {}; -} - -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) -{ - CMT_ASSUME(blocks > 0); - constexpr static size_t prefetch_offset = width * 4; - DFT_ASSERT(2 <= blocks); - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (size_t b = 0; b < blocks; b += 2) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 16> vlo = cread<16, aligned>(out); - cvec<T, 16> vhi = cread<16, aligned>(out + 16); - butterfly4<4, inverse>(vlo); - butterfly4<4, inverse>(vhi); - apply_twiddles4<0, 4, 4, inverse>(vlo); - apply_twiddles4<0, 4, 4, inverse>(vhi); - vlo = digitreverse4<2>(vlo); - vhi = digitreverse4<2>(vhi); - butterfly4<4, inverse>(vlo); - butterfly4<4, inverse>(vhi); - - use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo); - use_br2 ? 
cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi); - out += 32; - } - return {}; -} +constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +namespace intrinsics { - constexpr static size_t prefetch_offset = width * 4; - CMT_ASSUME(blocks > 0); - DFT_ASSERT(4 <= blocks); - CMT_LOOP_NOUNROLL - for (size_t b = 0; b < blocks; b += 4) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out); - butterfly4<4, inverse>(v16); - cdigitreverse4_write<aligned>(out, v16); - - out += 4 * 4; - } - return {}; -} template <typename T> -static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) +void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) { complex<T>* twiddle = ptr_cast<complex<T>>(stage->data); const size_t N = stage->repeats * stage->radix; @@ -507,7 +80,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) template <typename T, size_t fixed_radix> struct dft_stage_fixed_impl : dft_stage<T> { - dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks) + dft_stage_fixed_impl(size_t, size_t iterations, size_t blocks) { this->name = type_name<decltype(*this)>(); this->radix = fixed_radix; @@ -523,11 +96,11 @@ struct dft_stage_fixed_impl : dft_stage<T> constexpr static size_t width = fixed_radix >= 7 ? fft_vector_width<T> / 2 : fixed_radix >= 4 ? 
fft_vector_width<T> : fft_vector_width<T> * 2; - virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); } + virtual void do_initialize(size_t) override final { dft_stage_fixed_initialize(this, width); } DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { const size_t Nord = this->repeats; const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); @@ -546,7 +119,7 @@ struct dft_stage_fixed_impl : dft_stage<T> template <typename T, size_t fixed_radix> struct dft_stage_fixed_final_impl : dft_stage<T> { - dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks) + dft_stage_fixed_final_impl(size_t, size_t iterations, size_t blocks) { this->name = type_name<decltype(*this)>(); this->radix = fixed_radix; @@ -561,10 +134,9 @@ struct dft_stage_fixed_final_impl : dft_stage<T> DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { - const size_t b = this->blocks; - const size_t size = b * fixed_radix; + const size_t b = this->blocks; butterflies(b, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, b); } @@ -584,27 +156,32 @@ inline auto apply_conj(E& e, ctrue_t) /// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1] template <typename E> -struct fft_inverse : expression_base<E> +struct fft_inverse : internal::expression_with_arguments<E> { using value_type = value_type_of<E>; - CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {} + KFR_MEM_INTRINSIC fft_inverse(E&& expr) CMT_NOEXCEPT + : internal::expression_with_arguments<E>(std::forward<E>(expr)) + { + } - CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const + friend KFR_INTRINSIC vec<value_type, 1> 
get_elements(const fft_inverse& self, cinput_t input, + size_t index, vec_shape<value_type, 1>) { - return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>()); + return self.argument_first(input, index == 0 ? 0 : self.size() - index, vec_shape<value_type, 1>()); } template <size_t N> - CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const + friend KFR_MEM_INTRINSIC vec<value_type, N> get_elements(const fft_inverse& self, cinput_t input, + size_t index, vec_shape<value_type, N>) { if (index == 0) { return concat( - this->argument_first(input, index, vec_t<value_type, 1>()), - reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>()))); + self.argument_first(input, index, vec_shape<value_type, 1>()), + reverse(self.argument_first(input, self.size() - (N - 1), vec_shape<value_type, N - 1>()))); } - return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>())); + return reverse(self.argument_first(input, self.size() - index - (N - 1), vec_shape<value_type, N>())); } }; @@ -618,7 +195,7 @@ template <typename T> struct dft_arblen_stage_impl : dft_stage<T> { dft_arblen_stage_impl(size_t size) - : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size) + : size(size), fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal) { this->name = type_name<decltype(*this)>(); this->radix = size; @@ -642,10 +219,9 @@ struct dft_arblen_stage_impl : dft_stage<T> DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { - const size_t n = this->size; - const size_t N2 = this->fftsize; + const size_t n = this->size; auto&& chirp = apply_conj(chirp_, cbool<inverse>); @@ -703,7 +279,7 @@ struct dft_special_stage_impl : dft_stage<T> } DFT_STAGE_FN template 
<bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size); stage1.do_execute(cbool<inverse>, scratch, in, temp); @@ -730,7 +306,7 @@ struct dft_stage_generic_impl : dft_stage<T> } protected: - virtual void do_initialize(size_t size) override final + virtual void do_initialize(size_t) override final { complex<T>* twiddle = ptr_cast<complex<T>>(this->data); CMT_LOOP_NOUNROLL @@ -746,12 +322,10 @@ protected: DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); const size_t bl = this->blocks; - const size_t Nord = this->repeats; - const size_t N = Nord * this->radix; CMT_LOOP_NOUNROLL for (size_t b = 0; b < bl; b++) @@ -848,7 +422,7 @@ protected: DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { cswitch(dft_radices, radices[0], [&](auto first_radix) { @@ -883,441 +457,7 @@ protected: }); } }; - -template <typename T, bool splitin, bool is_even> -struct fft_stage_impl : dft_stage<T> -{ - fft_stage_impl(size_t stage_size) - { - this->name = type_name<decltype(*this)>(); - this->radix = 4; - this->stage_size = stage_size; - this->repeats = 4; - this->recursion = true; - this->data_size = - align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); - } - -protected: - constexpr static bool prefetch = true; - constexpr static bool aligned = false; - constexpr static size_t width = fft_vector_width<T>; - - virtual void do_initialize(size_t size) override final - { - complex<T>* twiddle = 
ptr_cast<complex<T>>(this->data); - initialize_twiddles<T, width>(twiddle, this->stage_size, size, true); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - if (splitin) - in = out; - const size_t stg_size = this->stage_size; - CMT_ASSUME(stg_size >= 2048); - CMT_ASSUME(stg_size % 2048 == 0); - radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - } -}; - -template <typename T, bool splitin, size_t size> -struct fft_final_stage_impl : dft_stage<T> -{ - fft_final_stage_impl(size_t) - { - this->name = type_name<decltype(*this)>(); - this->radix = size; - this->stage_size = size; - this->out_offset = size; - this->repeats = 4; - this->recursion = true; - this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); - } - -protected: - constexpr static size_t width = fft_vector_width<T>; - constexpr static bool is_even = cometa::is_even(ilog2(size)); - constexpr static bool use_br2 = !is_even; - constexpr static bool aligned = false; - constexpr static bool prefetch = splitin; - - KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {} - KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {} - - template <size_t N, bool pass_splitin> - KFR_INTRIN void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>, complex<T>*& twiddle) - { - constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; - constexpr size_t pass_width = const_min(width, N / 4); - initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin); - init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle); - } - - virtual void do_initialize(size_t total_size) override final - { - complex<T>* twiddle = 
ptr_cast<complex<T>>(this->data); - init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle); - } - - template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> - KFR_INTRIN void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> - KFR_INTRIN void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse, size_t N, bool pass_splitin> - KFR_INTRIN void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out, - const 
complex<T>* in, const complex<T>*& twiddle) - { - static_assert(N > 8, ""); - constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; - constexpr size_t pass_width = const_min(width, N / 4); - static_assert(pass_width == width || (pass_split == pass_splitin), ""); - static_assert(pass_width <= N / 4, ""); - radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(), - cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, - twiddle); - final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle); - } -}; - -template <typename T, bool is_even> -struct fft_reorder_stage_impl : dft_stage<T> -{ - fft_reorder_stage_impl(size_t stage_size) - { - this->name = type_name<decltype(*this)>(); - this->stage_size = stage_size; - log2n = ilog2(stage_size); - this->data_size = 0; - } - -protected: - size_t log2n; - - virtual void do_initialize(size_t) override final {} - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_reorder(out, log2n, cbool_t<!is_even>()); - } -}; - -template <typename T, size_t log2n> -struct fft_specialization; - -template <typename T> -struct fft_specialization<T, 1> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 1> a0, a1; - split(cread<2, aligned>(in), a0, a1); - cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1)); - } -}; - -template <typename T> -struct fft_specialization<T, 2> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 1> a0, a1, a2, 
a3; - split(cread<4>(in), a0, a1, a2, a3); - butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); - cwrite<4>(out, concat(a0, a1, a2, a3)); - } -}; - -template <typename T> -struct fft_specialization<T, 3> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 8> v8 = cread<8, aligned>(in); - butterfly8<inverse>(v8); - cwrite<8, aligned>(out, v8); - } -}; - -template <typename T> -struct fft_specialization<T, 4> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 16> v16 = cread<16, aligned>(in); - butterfly16<inverse>(v16); - cwrite<16, aligned>(out, v16); - } -}; - -template <typename T> -struct fft_specialization<T, 5> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 32> v32 = cread<32, aligned>(in); - butterfly32<inverse>(v32); - cwrite<32, aligned>(out, v32); - } -}; - -template <typename T> -struct fft_specialization<T, 6> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in); - } -}; - -template <typename T> -struct fft_specialization<T, 7> : dft_stage<T> -{ - fft_specialization(size_t) - { - this->name = 
type_name<decltype(*this)>(); - this->stage_size = 128; - this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); - } - -protected: - constexpr static bool aligned = false; - constexpr static size_t width = platform<T>::vector_width; - constexpr static bool use_br2 = true; - constexpr static bool prefetch = false; - constexpr static bool is_double = sizeof(T) == 8; - constexpr static size_t final_size = is_double ? 8 : 32; - constexpr static size_t split_format = final_size == 8; - - virtual void do_initialize(size_t total_size) override final - { - complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - initialize_twiddles<T, width>(twiddle, 128, total_size, split_format); - initialize_twiddles<T, width>(twiddle, 32, total_size, split_format); - initialize_twiddles<T, width>(twiddle, 8, total_size, split_format); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_pass<inverse>(csize_t<final_size>(), out, in, twiddle); - if (this->need_reorder) - fft_reorder(out, csize_t<7>()); - } - - template <bool inverse> - KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) - { - radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) - { - radix4_pass(128, 1, csize_t<width>(), cfalse, 
cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } -}; - -template <> -struct fft_specialization<float, 8> : dft_stage<float> -{ - fft_specialization(size_t) - { - this->name = type_name<decltype(*this)>(); - this->temp_size = sizeof(complex<float>) * 256; - } - -protected: - using T = float; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) - { - complex<float>* scratch = ptr_cast<complex<float>>(temp); - if (out == in) - { - butterfly16_multi_flip<0, inverse>(scratch, out); - butterfly16_multi_flip<1, inverse>(scratch, out); - butterfly16_multi_flip<2, inverse>(scratch, out); - butterfly16_multi_flip<3, inverse>(scratch, out); - - butterfly16_multi_natural<0, inverse>(out, scratch); - butterfly16_multi_natural<1, inverse>(out, scratch); - butterfly16_multi_natural<2, inverse>(out, scratch); - butterfly16_multi_natural<3, inverse>(out, scratch); - } - else - { - butterfly16_multi_flip<0, inverse>(out, in); - butterfly16_multi_flip<1, inverse>(out, in); - butterfly16_multi_flip<2, inverse>(out, in); - butterfly16_multi_flip<3, inverse>(out, in); - - butterfly16_multi_natural<0, inverse>(out, out); - butterfly16_multi_natural<1, inverse>(out, out); - butterfly16_multi_natural<2, inverse>(out, out); - butterfly16_multi_natural<3, inverse>(out, out); - } - } -}; - -template <> -struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> -{ - using T = double; - fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<double, 
false, 256>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, csize_t<8>()); - } -}; - -template <typename T> -struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> -{ - fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, csize_t<9>()); - } -}; - -template <typename T> -struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> -{ - fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, 10, cfalse); - } -}; - -} // namespace internal - -// - -template <typename T> -template <typename Stage, typename... Args> -void dft_plan<T>::add_stage(Args... 
args) -{ - dft_stage<T>* stage = new Stage(args...); - stage->need_reorder = need_reorder; - this->data_size += stage->data_size; - this->temp_size += stage->temp_size; - stages.push_back(dft_stage_ptr(stage)); -} +} // namespace intrinsics template <typename T> template <bool is_final> @@ -1325,366 +465,83 @@ void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t bloc { return cswitch( dft_radices, radix, - [&](auto radix) CMT_INLINE_LAMBDA { - add_stage<conditional<is_final, internal::dft_stage_fixed_final_impl<T, val_of(radix)>, - internal::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, - blocks); + [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { + add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, + intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, + blocks); }, - [&]() { add_stage<internal::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); }); + [this, radix, iterations, blocks]() { + add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); + }); } template <typename T> -template <bool is_even, bool first> -void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +void dft_plan<T>::init_dft(size_t size, dft_order) { - constexpr size_t final_size = is_even ? 
1024 : 512; - - if (stage_size >= 2048) + if (size == 60) { - add_stage<internal::fft_stage_impl<T, !first, is_even>>(stage_size); - - make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(); } - else + else if (size == 48) { - add_stage<internal::fft_final_stage_impl<T, !first, final_size>>(final_size); + this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(); } -} - -template <typename T> -struct reverse_wrapper -{ - T& iterable; -}; - -template <typename T> -auto begin(reverse_wrapper<T> w) -{ - return std::rbegin(w.iterable); -} - -template <typename T> -auto end(reverse_wrapper<T> w) -{ - return std::rend(w.iterable); -} - -template <typename T> -reverse_wrapper<T> reversed(T&& iterable) -{ - return { iterable }; -} - -template <typename T> -void dft_plan<T>::initialize() -{ - data = autofree<u8>(data_size); - size_t offset = 0; - for (dft_stage_ptr& stage : stages) - { - stage->data = data.data() + offset; - stage->initialize(this->size); - offset += stage->data_size; - } - - bool to_scratch = false; - bool scratch_needed = false; - for (dft_stage_ptr& stage : reversed(stages)) - { - if (to_scratch) - { - scratch_needed = true; - } - stage->to_scratch = to_scratch; - if (!stage->can_inplace) - { - to_scratch = !to_scratch; - } - } - if (scratch_needed || !stages[0]->can_inplace) - this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); -} - -template <typename T> -const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const -{ - if (stage == 0) - return in_scratch ? scratch : in; - return stages[stage - 1]->to_scratch ? scratch : out; -} - -template <typename T> -complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const -{ - return stages[stage]->to_scratch ? 
scratch : out; -} - -template <typename T> -template <bool inverse> -void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const -{ - if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) - { - return stages[0]->execute(cbool<inverse>, out, in, temp); - } - size_t stack[32] = { 0 }; - - complex<T>* scratch = - ptr_cast<complex<T>>(temp + this->temp_size - - align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); - - bool in_scratch = !stages[0]->can_inplace && in == out; - if (in_scratch) + else { - internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); - } - - const size_t count = stages.size(); + size_t cur_size = size; + constexpr size_t radices_count = dft_radices.back() + 1; + u8 count[radices_count] = { 0 }; + int radices[32] = { 0 }; + size_t radices_size = 0; - for (size_t depth = 0; depth < count;) - { - if (stages[depth]->recursion) - { - size_t offset = 0; - size_t rdepth = depth; - size_t maxdepth = depth; - do + cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { + while (cur_size && cur_size % val_of(radix) == 0) { - if (stack[rdepth] == stages[rdepth]->repeats) - { - stack[rdepth] = 0; - rdepth--; - } - else - { - complex<T>* rout = select_out(rdepth, out, scratch); - const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); - stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); - offset += stages[rdepth]->out_offset; - stack[rdepth]++; - if (rdepth < count - 1 && stages[rdepth + 1]->recursion) - rdepth++; - else - maxdepth = rdepth; - } - } while (rdepth != depth); - depth = maxdepth + 1; - } - else - { - stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), - select_in(depth, out, in, scratch, in_scratch), temp); - depth++; - } - } -} + count[val_of(radix)]++; + cur_size /= val_of(radix); + } + }); -template <typename T> -dft_plan<T>::dft_plan(size_t 
size, dft_order order) : size(size), temp_size(0), data_size(0) -{ - need_reorder = true; - if (is_poweroftwo(size)) - { - const size_t log2n = ilog2(size); - cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, - [&](auto log2n) { - (void)log2n; - constexpr size_t log2nv = val_of(decltype(log2n)()); - this->add_stage<internal::fft_specialization<T, log2nv>>(size); - }, - [&]() { - cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { - this->make_fft(size, is_even, ctrue); - constexpr size_t is_evenv = val_of(decltype(is_even)()); - if (need_reorder) - this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size); - }); - }); - } -#ifndef KFR_DFT_NO_NPo2 - else - { - if (size == 60) - { - this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>(); - } - else if (size == 48) + if (cur_size >= 101) { - this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>(); + this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size); } else { - size_t cur_size = size; - constexpr size_t radices_count = dft_radices.back() + 1; - u8 count[radices_count] = { 0 }; - int radices[32] = { 0 }; - size_t radices_size = 0; - - cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { - while (cur_size && cur_size % val_of(radix) == 0) - { - count[val_of(radix)]++; - cur_size /= val_of(radix); - } - }); + size_t blocks = 1; + size_t iterations = size; - if (cur_size >= 101) + for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) { - this->add_stage<internal::dft_arblen_stage_impl<T>>(size); - } - else - { - size_t blocks = 1; - size_t iterations = size; - - for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) - { - for (size_t i = 0; i < count[r]; i++) - { - iterations /= r; - radices[radices_size++] = r; - if (iterations == 1) - this->prepare_dft_stage(r, iterations, blocks, ctrue); - else - this->prepare_dft_stage(r, iterations, blocks, cfalse); - blocks *= r; - } - } - - if (cur_size > 1) + for 
(size_t i = 0; i < count[r]; i++) { - iterations /= cur_size; - radices[radices_size++] = cur_size; + iterations /= r; + radices[radices_size++] = r; if (iterations == 1) - this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + this->prepare_dft_stage(r, iterations, blocks, ctrue); else - this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + this->prepare_dft_stage(r, iterations, blocks, cfalse); + blocks *= r; } - - if (stages.size() > 2) - this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size); } - } - } -#endif - initialize(); -} - -template <typename T> -dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) -{ - using namespace internal; - constexpr size_t width = platform<T>::vector_width * 2; - - block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { - constexpr size_t width = val_of(decltype(w)()); - cwrite<width>(rtwiddle.data() + i, - cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); - }); -} - -template <typename T> -void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const -{ - using namespace internal; - size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 - - constexpr size_t width = platform<T>::vector_width * 2; - const cvec<T, 1> dc = cread<1>(out); - const size_t count = csize / 2; - - block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { - i++; - constexpr size_t width = val_of(decltype(w)()); - constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(out + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); - - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul(f2k, tw); - cwrite<width>(out + i, T(0.5) * (f1k + t)); - cwrite<width>(out + 
csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t)))); - }); - - { - size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(out + k); - const cvec<T, 1> fpnk = negodd(fpk); - cwrite<1>(out + k, fpnk); - } - if (fmt == dft_pack_format::CCs) - { - cwrite<1>(out, pack(dc[0] + dc[1], 0)); - cwrite<1>(out + csize, pack(dc[0] - dc[1], 0)); - } - else - { - cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1])); - } -} - -template <typename T> -void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const -{ - using namespace internal; - - const size_t csize = this->size / 2; - - cvec<T, 1> dc; - - if (fmt == dft_pack_format::CCs) - { - dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real()); - } - else - { - dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); - } - - constexpr size_t width = platform<T>::vector_width * 2; - const size_t count = csize / 2; - - block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { - i++; - constexpr size_t width = val_of(decltype(w)()); - constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(in + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); - - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul_conj(f2k, tw); - cwrite<width>(out + i, f1k + t); - cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); - }); + if (cur_size > 1) + { + iterations /= cur_size; + radices[radices_size++] = cur_size; + if (iterations == 1) + this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + else + this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + } - { - size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(in + k); - const cvec<T, 1> fpnk = 2 * negodd(fpk); - cwrite<1>(out + k, fpnk); + if (stages.size() > 2) + 
this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size); + } } - cwrite<1>(out, dc); } -template <typename T> -dft_plan<T>::~dft_plan() -{ -} - -template <typename T> -void dft_plan<T>::dump() const -{ - for (const dft_stage_ptr& s : stages) - { - s->dump(); - } -} +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp @@ -24,7 +24,8 @@ See https://www.kfrlib.com for details. */ -#include "dft-impl.hpp" +#include "../dft_c.h" +#include "../fft.hpp" namespace kfr { @@ -41,27 +42,26 @@ extern "C" return reinterpret_cast<KFR_DFT_PLAN_F64*>(new kfr::dft_plan<double>(size)); } - void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in, uint8_t* temp) + void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in), temp, kfr::cfalse); } - void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in, - uint8_t* temp) + void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute( reinterpret_cast<kfr::complex<double>*>(out), reinterpret_cast<const kfr::complex<double>*>(in), temp, kfr::cfalse); } - void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in, + void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in), temp, kfr::ctrue); } - void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in, + void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* 
plan, size_t, double* out, const double* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute( @@ -89,29 +89,29 @@ extern "C" return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size)); } - void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in, + void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), in, temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in, + void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( reinterpret_cast<kfr::complex<double>*>(out), in, temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, - const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, + uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( out, reinterpret_cast<const kfr::complex<float>*>(in), temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, - const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, + uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( out, reinterpret_cast<const 
kfr::complex<double>*>(in), temp, diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp @@ -29,19 +29,13 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ -template dft_plan<FLOAT>::dft_plan(size_t, dft_order); -template dft_plan<FLOAT>::~dft_plan(); -template void dft_plan<FLOAT>::dump() const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template dft_plan_real<FLOAT>::dft_plan_real(size_t); -template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, - kfr::dft_pack_format fmt) const; -template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; - +#ifndef KFR_DFT_NO_NPo2 +template void dft_plan<FLOAT>::init_dft(size_t, dft_order); +#endif +} // namespace CMT_ARCH_NAME } // namespace kfr #endif diff --git a/include/kfr/dft/impl/fft-impl-f32.cpp b/include/kfr/dft/impl/fft-impl-f32.cpp @@ -0,0 +1,29 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#include "fft-impl.hpp" + +#define FLOAT float +#include "fft-templates.hpp" diff --git a/include/kfr/dft/impl/fft-impl-f64.cpp b/include/kfr/dft/impl/fft-impl-f64.cpp @@ -0,0 +1,29 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#include "fft-impl.hpp" + +#define FLOAT double +#include "fft-templates.hpp" diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp @@ -0,0 +1,1148 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "dft-fft.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +#endif +#if CMT_HAS_WARNING("-Wunused-lambda-capture") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") +#endif + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4100)) + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <size_t width, bool inverse, typename T> +KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, + const cvec<T, width>& w, const cvec<T, width>& tw) +{ + cvec<T, width> ww = w; + cvec<T, width> tw_ = tw; + cvec<T, width> b1 = ww * dupeven(tw_); + ww = swap<2>(ww); + + if (inverse) + tw_ = -(tw_); + ww = subadd(b1, ww * dupodd(tw_)); + return ww; +} + +template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>, + cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) +{ + const size_t N4 = N / 4; + cvec<T, width> w1, w2, w3; + + cvec<T, width> sum02, sum13, diff02, diff13; + + cvec<T, width> a0, a1, a2, a3; + a0 = cread<width, aligned>(in + 0); + a2 = cread<width, aligned>(in + N4 * 
2); + sum02 = a0 + a2; + + a1 = cread<width, aligned>(in + N4); + a3 = cread<width, aligned>(in + N4 * 3); + sum13 = a1 + a3; + + cwrite<width, aligned>(out, sum02 + sum13); + w2 = sum02 - sum13; + cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); + diff02 = a0 - a2; + diff13 = a1 - a3; + if (inverse) + { + diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); + diff13 = swap<2>(diff13); + } + else + { + diff13 = swap<2>(diff13); + diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); + } + + w1 = diff02 + diff13; + + cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); + w3 = diff02 - diff13; + cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), + w3, cread<width, true>(twiddle + width * 2))); +} + +template <size_t width, bool inverse, typename T> +KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, + const cvec<T, width>& w, const cvec<T, width>& tw) +{ + vec<T, width> re1, im1, twre, twim; + split(w, re1, im1); + split(tw, twre, twim); + + const vec<T, width> b1re = re1 * twre; + const vec<T, width> b1im = im1 * twre; + if (inverse) + return concat(b1re + im1 * twim, b1im - re1 * twim); + else + return concat(b1re - im1 * twim, b1im + re1 * twim); +} + +template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>, + cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle) +{ + const size_t N4 = N / 4; + cvec<T, width> w1, w2, w3; + constexpr bool read_split = !splitin && splitout; + constexpr bool write_split = 
splitin && !splitout; + + vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3; + + split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0); + split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1); + split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2); + split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3); + + const vec<T, width> sum02re = re0 + re2; + const vec<T, width> sum02im = im0 + im2; + const vec<T, width> sum13re = re1 + re3; + const vec<T, width> sum13im = im1 + im3; + + cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); + w2 = concat(sum02re - sum13re, sum02im - sum13im); + cwrite_split<width, aligned, write_split>( + out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); + + const vec<T, width> diff02re = re0 - re2; + const vec<T, width> diff02im = im0 - im2; + const vec<T, width> diff13re = re1 - re3; + const vec<T, width> diff13im = im1 - im3; + + (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); + (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); + + cwrite_split<width, aligned, write_split>( + out + N4 * (use_br2 ? 
2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); + cwrite_split<width, aligned, write_split>( + out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, + cread<width, true>(twiddle + width * 2))); +} + +template <typename T> +CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) +{ + if (n == 0) + { + return make_vector(static_cast<T>(1), static_cast<T>(0)); + } + else if (n == size / 4) + { + return make_vector(static_cast<T>(0), static_cast<T>(-1)); + } + else if (n == size / 2) + { + return make_vector(static_cast<T>(-1), static_cast<T>(0)); + } + else if (n == size * 3 / 4) + { + return make_vector(static_cast<T>(0), static_cast<T>(1)); + } + else + { + fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); + fbase tcos = +kfr::cos(kth); + fbase tsin = -kfr::sin(kth); + return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); + } +} + +template <typename T, size_t width> +KFR_INTRINSIC void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size, + bool split_format) +{ + vec<T, 2 * width> result = T(); + CMT_LOOP_UNROLL + for (size_t i = 0; i < width; i++) + { + const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); + result[i * 2] = r[0]; + result[i * 2 + 1] = r[1]; + } + if (split_format) + ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result); + else + ref_cast<cvec<T, width>>(twiddle[0]) = result; + twiddle += width; +} + +template <typename T, size_t width> +CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) +{ + const size_t count = stage_size / 4; + size_t nnstep = size / stage_size; + DFT_ASSERT(width <= count); + CMT_LOOP_NOUNROLL + for (size_t n = 0; n < count; n += width) + { + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, 
nnstep * 2, size, split_format); + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format); + } +} + +#ifdef KFR_NO_PREFETCH +#define KFR_PREFETCH(addr) \ + do \ + { \ + (void)(addr); \ + } while (0) +#else + +#if defined CMT_ARCH_SSE +#ifdef CMT_COMPILER_GNU +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); +#else +#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); +#endif +#else +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); +#endif +#endif + +template <typename T> +KFR_INTRINSIC void prefetch_one(const complex<T>* in) +{ + KFR_PREFETCH(in); +} + +template <typename T> +KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in) +{ + KFR_PREFETCH(in); + KFR_PREFETCH(in + stride); + KFR_PREFETCH(in + stride * 2); + KFR_PREFETCH(in + stride * 3); +} + +template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, + bool inverse, bool aligned, typename T> +KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, + cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, + cbool_t<aligned>, complex<T>* out, const complex<T>* in, + const complex<T>*& twiddle) +{ + constexpr static size_t prefetch_offset = width * 8; + const auto N4 = N / csize_t<4>(); + const auto N43 = N4 * csize_t<3>(); + CMT_ASSUME(blocks > 0); + CMT_ASSUME(N > 0); + CMT_ASSUME(N4 > 0); + DFT_ASSERT(width <= N4); + CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) + { + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (size_t n2 = 0; n2 < N4; n2 += width) + { + if (prefetch) + prefetch_four(N4, in + prefetch_offset); + radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), + cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, + in, twiddle + n2 * 3); + in += width; + out += width; + } + in += 
N43; + out += N43; + } + twiddle += N43; + return {}; +} + +template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks > 0); + constexpr static size_t prefetch_offset = 32 * 4; + for (size_t b = 0; b < blocks; b++) + { + if (prefetch) + prefetch_four(csize_t<64>(), out + prefetch_offset); + cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; + split(cread_split<8, aligned, splitin>(out + 0), w0, w1); + split(cread_split<8, aligned, splitin>(out + 8), w2, w3); + split(cread_split<8, aligned, splitin>(out + 16), w4, w5); + split(cread_split<8, aligned, splitin>(out + 24), w6, w7); + + butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); + + w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); + w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); + w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); + w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); + w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); + w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); + w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); + + cvec<T, 8> z0, z1, z2, z3; + transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); + + butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); + cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3))); + out += 32; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks 
> 0); + DFT_ASSERT(2 <= blocks); + constexpr static size_t prefetch_offset = width * 16; + for (size_t b = 0; b < blocks; b += 2) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 8> vlo = cread<8, aligned>(out + 0); + cvec<T, 8> vhi = cread<8, aligned>(out + 8); + butterfly8<inverse>(vlo); + butterfly8<inverse>(vhi); + vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo); + vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi); + cwrite<8, aligned>(out, vlo); + cwrite<8, aligned>(out + 8, vhi); + out += 16; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks > 0); + constexpr static size_t prefetch_offset = width * 4; + DFT_ASSERT(2 <= blocks); + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (size_t b = 0; b < blocks; b += 2) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 16> vlo = cread<16, aligned>(out); + cvec<T, 16> vhi = cread<16, aligned>(out + 16); + butterfly4<4, inverse>(vlo); + butterfly4<4, inverse>(vhi); + apply_twiddles4<0, 4, 4, inverse>(vlo); + apply_twiddles4<0, 4, 4, inverse>(vhi); + vlo = digitreverse4<2>(vlo); + vhi = digitreverse4<2>(vhi); + butterfly4<4, inverse>(vlo); + butterfly4<4, inverse>(vhi); + + use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo); + use_br2 ? 
cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi); + out += 32; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + constexpr static size_t prefetch_offset = width * 4; + CMT_ASSUME(blocks > 0); + DFT_ASSERT(4 <= blocks); + CMT_LOOP_NOUNROLL + for (size_t b = 0; b < blocks; b += 4) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out); + butterfly4<4, inverse>(v16); + cdigitreverse4_write<aligned>(out, v16); + + out += 4 * 4; + } + return {}; +} + +template <typename T, bool splitin, bool is_even> +struct fft_stage_impl : dft_stage<T> +{ + fft_stage_impl(size_t stage_size) + { + this->name = type_name<decltype(*this)>(); + this->radix = 4; + this->stage_size = stage_size; + this->repeats = 4; + this->recursion = true; + this->data_size = + align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); + } + +protected: + constexpr static bool prefetch = true; + constexpr static bool aligned = false; + constexpr static size_t width = fft_vector_width<T>; + + virtual void do_initialize(size_t size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddles<T, width>(twiddle, this->stage_size, size, true); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + if (splitin) + in = out; + const size_t stg_size = this->stage_size; + CMT_ASSUME(stg_size >= 2048); + CMT_ASSUME(stg_size % 2048 == 0); + radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), 
cbool_t<!is_even>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + } +}; + +template <typename T, bool splitin, size_t size> +struct fft_final_stage_impl : dft_stage<T> +{ + fft_final_stage_impl(size_t) + { + this->name = type_name<decltype(*this)>(); + this->radix = size; + this->stage_size = size; + this->out_offset = size; + this->repeats = 4; + this->recursion = true; + this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); + } + +protected: + constexpr static size_t width = fft_vector_width<T>; + constexpr static bool is_even = cometa::is_even(ilog2(size)); + constexpr static bool use_br2 = !is_even; + constexpr static bool aligned = false; + constexpr static bool prefetch = splitin; + + KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {} + KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {} + + template <size_t N, bool pass_splitin> + KFR_MEM_INTRINSIC void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>, + complex<T>*& twiddle) + { + constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; + constexpr size_t pass_width = const_min(width, N / 4); + initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin); + init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle); + } + + virtual void do_initialize(size_t total_size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle); + } + + template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> + KFR_MEM_INTRINSIC void 
final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> + KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse, size_t N, bool pass_splitin> + KFR_MEM_INTRINSIC void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out, + const complex<T>* in, const complex<T>*& twiddle) + { + static_assert(N > 8, ""); + constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; + constexpr size_t pass_width = const_min(width, N / 4); + static_assert(pass_width == width || (pass_split == pass_splitin), ""); + static_assert(pass_width <= N / 4, ""); + radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(), + cbool_t<use_br2>(), cbool_t<prefetch>(), 
cbool_t<inverse>(), cbool_t<aligned>(), out, in, + twiddle); + final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle); + } +}; + +template <typename T, bool is_even> +struct fft_reorder_stage_impl : dft_stage<T> +{ + fft_reorder_stage_impl(size_t stage_size) + { + this->name = type_name<decltype(*this)>(); + this->stage_size = stage_size; + log2n = ilog2(stage_size); + this->data_size = 0; + } + +protected: + size_t log2n; + + virtual void do_initialize(size_t) override final {} + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*) + { + fft_reorder(out, log2n, cbool_t<!is_even>()); + } +}; + +template <typename T, size_t log2n> +struct fft_specialization; + +template <typename T> +struct fft_specialization<T, 1> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 1> a0, a1; + split(cread<2, aligned>(in), a0, a1); + cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1)); + } +}; + +template <typename T> +struct fft_specialization<T, 2> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 1> a0, a1, a2, a3; + split(cread<4>(in), a0, a1, a2, a3); + butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); + cwrite<4>(out, concat(a0, a1, a2, a3)); + } +}; + +template <typename T> +struct fft_specialization<T, 3> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC 
void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 8> v8 = cread<8, aligned>(in); + butterfly8<inverse>(v8); + cwrite<8, aligned>(out, v8); + } +}; + +template <typename T> +struct fft_specialization<T, 4> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 16> v16 = cread<16, aligned>(in); + butterfly16<inverse>(v16); + cwrite<16, aligned>(out, v16); + } +}; + +template <typename T> +struct fft_specialization<T, 5> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 32> v32 = cread<32, aligned>(in); + butterfly32<inverse>(v32); + cwrite<32, aligned>(out, v32); + } +}; + +template <typename T> +struct fft_specialization<T, 6> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in); + } +}; + +template <typename T> +struct fft_specialization<T, 7> : dft_stage<T> +{ + fft_specialization(size_t) + { + this->name = type_name<decltype(*this)>(); + this->stage_size = 128; + this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); + } + +protected: + constexpr static bool aligned = false; + constexpr static size_t width = vector_width<T>; + constexpr static bool use_br2 = true; + constexpr static bool prefetch = false; + constexpr static bool is_double = sizeof(T) == 8; + 
constexpr static size_t final_size = is_double ? 8 : 32; + constexpr static size_t split_format = final_size == 8; + + virtual void do_initialize(size_t total_size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddles<T, width>(twiddle, 128, total_size, split_format); + initialize_twiddles<T, width>(twiddle, 32, total_size, split_format); + initialize_twiddles<T, width>(twiddle, 8, total_size, split_format); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + final_pass<inverse>(csize_t<final_size>(), out, in, twiddle); + if (this->need_reorder) + fft_reorder(out, csize_t<7>()); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) + { + radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) + { + radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } +}; + +template <> +struct fft_specialization<float, 8> : dft_stage<float> +{ + 
fft_specialization(size_t) + { + this->name = type_name<decltype(*this)>(); + this->temp_size = sizeof(complex<float>) * 256; + } + +protected: + using T = float; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + complex<float>* scratch = ptr_cast<complex<float>>(temp); + if (out == in) + { + butterfly16_multi_flip<0, inverse>(scratch, out); + butterfly16_multi_flip<1, inverse>(scratch, out); + butterfly16_multi_flip<2, inverse>(scratch, out); + butterfly16_multi_flip<3, inverse>(scratch, out); + + butterfly16_multi_natural<0, inverse>(out, scratch); + butterfly16_multi_natural<1, inverse>(out, scratch); + butterfly16_multi_natural<2, inverse>(out, scratch); + butterfly16_multi_natural<3, inverse>(out, scratch); + } + else + { + butterfly16_multi_flip<0, inverse>(out, in); + butterfly16_multi_flip<1, inverse>(out, in); + butterfly16_multi_flip<2, inverse>(out, in); + butterfly16_multi_flip<3, inverse>(out, in); + + butterfly16_multi_natural<0, inverse>(out, out); + butterfly16_multi_natural<1, inverse>(out, out); + butterfly16_multi_natural<2, inverse>(out, out); + butterfly16_multi_natural<3, inverse>(out, out); + } + } +}; + +template <> +struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> +{ + using T = double; + fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, csize_t<8>()); + } +}; + +template <typename T> +struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> +{ + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) + { + this->name = 
type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, csize_t<9>()); + } +}; + +template <typename T> +struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> +{ + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, 10, cfalse); + } +}; + +} // namespace intrinsics + +template <typename T> +template <bool is_even, bool first> +void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +{ + constexpr size_t final_size = is_even ? 
1024 : 512; + + if (stage_size >= 2048) + { + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size); + + make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size); + } +} + +template <typename T> +struct reverse_wrapper +{ + T& iterable; +}; + +template <typename T> +auto begin(reverse_wrapper<T> w) +{ + return std::rbegin(w.iterable); +} + +template <typename T> +auto end(reverse_wrapper<T> w) +{ + return std::rend(w.iterable); +} + +template <typename T> +reverse_wrapper<T> reversed(T&& iterable) +{ + return { iterable }; +} + +template <typename T> +void dft_plan<T>::initialize() +{ + data = autofree<u8>(data_size); + size_t offset = 0; + for (dft_stage_ptr& stage : stages) + { + stage->data = data.data() + offset; + stage->initialize(this->size); + offset += stage->data_size; + } + + bool to_scratch = false; + bool scratch_needed = false; + for (dft_stage_ptr& stage : reversed(stages)) + { + if (to_scratch) + { + scratch_needed = true; + } + stage->to_scratch = to_scratch; + if (!stage->can_inplace) + { + to_scratch = !to_scratch; + } + } + if (scratch_needed || !stages[0]->can_inplace) + this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); +} + +template <typename T> +const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, + const complex<T>* scratch, bool in_scratch) const +{ + if (stage == 0) + return in_scratch ? scratch : in; + return stages[stage - 1]->to_scratch ? scratch : out; +} + +template <typename T> +complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const +{ + return stages[stage]->to_scratch ? 
scratch : out; +} + +template <typename T> +template <bool inverse> +void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const +{ + if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) + { + return stages[0]->execute(cbool<inverse>, out, in, temp); + } + size_t stack[32] = { 0 }; + + complex<T>* scratch = + ptr_cast<complex<T>>(temp + this->temp_size - + align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); + + bool in_scratch = !stages[0]->can_inplace && in == out; + if (in_scratch) + { + builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); + } + + const size_t count = stages.size(); + + for (size_t depth = 0; depth < count;) + { + if (stages[depth]->recursion) + { + size_t offset = 0; + size_t rdepth = depth; + size_t maxdepth = depth; + do + { + if (stack[rdepth] == stages[rdepth]->repeats) + { + stack[rdepth] = 0; + rdepth--; + } + else + { + complex<T>* rout = select_out(rdepth, out, scratch); + const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); + stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); + offset += stages[rdepth]->out_offset; + stack[rdepth]++; + if (rdepth < count - 1 && stages[rdepth + 1]->recursion) + rdepth++; + else + maxdepth = rdepth; + } + } while (rdepth != depth); + depth = maxdepth + 1; + } + else + { + stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), + select_in(depth, out, in, scratch, in_scratch), temp); + depth++; + } + } +} + +template <typename T> +void dft_plan<T>::init_fft(size_t size, dft_order) +{ + const size_t log2n = ilog2(size); + cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, + [&](auto log2n) { + (void)log2n; + constexpr size_t log2nv = val_of(decltype(log2n)()); + this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size); + }, + [&]() { + cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { + this->make_fft(size, is_even, ctrue); + constexpr 
size_t is_evenv = val_of(decltype(is_even)()); + if (need_reorder) + this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size); + }); + }); +} + +template <typename T> +dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0) +{ + need_reorder = true; + if (is_poweroftwo(size)) + { + init_fft(size, order); + } +#ifndef KFR_DFT_NO_NPo2 + else + { + init_dft(size, order); + } +#endif + initialize(); +} + +template <typename T> +dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) +{ + using namespace intrinsics; + + constexpr size_t width = vector_width<T> * 2; + + block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { + constexpr size_t width = val_of(decltype(w)()); + cwrite<width>(rtwiddle.data() + i, + cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); + }); +} + +template <typename T> +void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const +{ + using namespace intrinsics; + size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 + + constexpr size_t width = vector_width<T> * 2; + const cvec<T, 1> dc = cread<1>(out); + const size_t count = csize / 2; + + block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { + i++; + constexpr size_t width = val_of(decltype(w)()); + constexpr size_t widthm1 = width - 1; + const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> fpk = cread<width>(out + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); + + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul(f2k, tw); + cwrite<width>(out + i, T(0.5) * (f1k + t)); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t)))); + }); + + { + size_t k = csize / 2; + const cvec<T, 1> fpk = cread<1>(out + 
k); + const cvec<T, 1> fpnk = negodd(fpk); + cwrite<1>(out + k, fpnk); + } + if (fmt == dft_pack_format::CCs) + { + cwrite<1>(out, pack(dc[0] + dc[1], 0)); + cwrite<1>(out + csize, pack(dc[0] - dc[1], 0)); + } + else + { + cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1])); + } +} + +template <typename T> +void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const +{ + using namespace intrinsics; + + const size_t csize = this->size / 2; + + cvec<T, 1> dc; + + if (fmt == dft_pack_format::CCs) + { + dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real()); + } + else + { + dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); + } + + constexpr size_t width = vector_width<T> * 2; + const size_t count = csize / 2; + + block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { + i++; + constexpr size_t width = val_of(decltype(w)()); + constexpr size_t widthm1 = width - 1; + const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> fpk = cread<width>(in + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); + + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul_conj(f2k, tw); + cwrite<width>(out + i, f1k + t); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); + }); + + { + size_t k = csize / 2; + const cvec<T, 1> fpk = cread<1>(in + k); + const cvec<T, 1> fpnk = 2 * negodd(fpk); + cwrite<1>(out + k, fpnk); + } + cwrite<1>(out, dc); +} + +template <typename T> +dft_plan<T>::~dft_plan() +{ +} + +template <typename T> +void dft_plan<T>::dump() const +{ + for (const dft_stage_ptr& s : stages) + { + s->dump(); + } +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp @@ -0,0 +1,50 
@@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ + +#ifdef FLOAT +#include "../fft.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template dft_plan<FLOAT>::dft_plan(size_t, dft_order); +template void dft_plan<FLOAT>::init_fft(size_t, dft_order); +template dft_plan<FLOAT>::~dft_plan(); +template void dft_plan<FLOAT>::dump() const; +template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, + const kfr::complex<FLOAT>* in, kfr::u8* temp) const; +template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, + const kfr::complex<FLOAT>* in, kfr::u8* temp) const; +template dft_plan_real<FLOAT>::dft_plan_real(size_t); +template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, + kfr::dft_pack_format fmt) const; +template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#endif diff --git a/include/kfr/dft/impl/ft.hpp 
b/include/kfr/dft/impl/ft.hpp @@ -25,40 +25,45 @@ */ #pragma once -#include "../../base/complex.hpp" -#include "../../base/constants.hpp" -#include "../../base/digitreverse.hpp" -#include "../../base/read_write.hpp" -#include "../../base/sin_cos.hpp" #include "../../base/small_buffer.hpp" #include "../../base/univector.hpp" -#include "../../base/vec.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/complex.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/digitreverse.hpp" +#include "../../simd/read_write.hpp" +#include "../../simd/vec.hpp" #include "../../base/memory.hpp" -#include "../../data/sincos.hpp" +#include "../data/sincos.hpp" CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4127)) namespace kfr { +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N> +using cvec = vec<T, N * 2>; -namespace internal +namespace intrinsics { template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) { return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_impl(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_impl(xx, y); @@ -66,24 +71,24 @@ CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) /// Complex Multiplication template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) +KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const 
vec<T, N2>& y) { - return internal::cmul_impl(x, y); + return intrinsics::cmul_impl(x, y); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) { return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) +KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) { return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, - const vec<T, N>& tw) +KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, + const vec<T, N>& tw) { const vec<T, N> twr = dupeven(tw); const vec<T, N> twi = dupodd(tw); @@ -95,82 +100,79 @@ CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in out1 += sumtw - diftw; } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_conj(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_conj(xx, y); } -template <typename T, size_t N> -using cvec = vec<T, N * 2>; - template <size_t N, bool A = false, typename T> -CMT_INLINE cvec<T, N> cread(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) { return cvec<T, N>(ptr_cast<T>(src), 
cbool_t<A>()); } template <size_t N, bool A = false, typename T> -CMT_INLINE void cwrite(complex<T>* dest, const cvec<T, N>& value) +KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) { value.write(ptr_cast<T>(dest)); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) +KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) +KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... }; } template <size_t count, size_t N, bool A, typename T, size_t... indices> -CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) +KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, - csizes_t<indices...>) +KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, + csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... 
}; } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src) +KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src) { return cread_group_impl<count, N, stride, A>(src, csizeseq_t<count>()); } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value) +KFR_INTRINSIC void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value) { return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> -CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) +KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) { return cread_group_impl<count, N, A>(src, stride, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) +KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) { return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq_t<count>()); } template <size_t N, bool A = false, bool split = false, typename T> -CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) { cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); if (split) @@ -179,7 +181,7 @@ CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) } template <size_t N, bool A = false, bool split = false, typename T> -CMT_INLINE void cwrite_split(complex<T>* dest, const cvec<T, N>& value) +KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) { cvec<T, N> v = value; if (split) @@ -262,13 +264,13 @@ inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, } template <size_t N, size_t stride, 
typename T, size_t... Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); } template <size_t N, size_t stride, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) { if (stride == 1) { @@ -278,7 +280,7 @@ CMT_INLINE cvec<T, N> cgather(const complex<T>* base) return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); } -CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) { size_t temp = index; index += stride; @@ -286,7 +288,7 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t index -= size; return temp; } -CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t) { size_t temp = index; index += stride; @@ -294,45 +296,45 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) } template <size_t N, typename T, size_t... 
Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, - csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, + csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) { return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride) { size_t index = 0; return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T, size_t... Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, - csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, + csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) { return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); } template <size_t N, size_t stride, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) +KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... 
}; } template <size_t N, size_t stride, typename T> -CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value) +KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) { if (stride == 1) { @@ -345,34 +347,35 @@ CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value) } template <size_t N, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, - csizes_t<Indices...>) +KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, + csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; } template <size_t N, typename T> -CMT_INLINE void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) +KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) { return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) +KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) { return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) +KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) { return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); } template <typename T> -KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, - const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, - cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, cvec<T, 4>& w7) +KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, 
const cvec<T, 8>& z1, const cvec<T, 8>& z2, + const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, + cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, + cvec<T, 4>& w7) { cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); @@ -389,10 +392,10 @@ KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const c } template <typename T> -KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, - const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, - const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, - cvec<T, 8>& z2, cvec<T, 8>& z3) +KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, + const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, + const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, + cvec<T, 8>& z2, cvec<T, 8>& z3) { cvec<T, 16> a = concat(w0, w1, w2, w3); cvec<T, 16> b = concat(w4, w5, w6, w7); @@ -405,7 +408,7 @@ KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const c } template <typename T> -void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) { cvec<T, 4> a0, a1, a2, a3; cvec<T, 4> b0, b1, b2, b3; @@ -423,8 +426,8 @@ void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) d = concat(a3, b3, c3, d3); } template <typename T> -void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, - cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, + cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) { cvec<T, 4> a0, a1, a2, a3; cvec<T, 4> b0, b1, b2, b3; @@ -443,35 +446,35 @@ void 
transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, } template <bool b, typename T> -constexpr KFR_INTRIN T chsign(T x) +constexpr KFR_INTRINSIC T chsign(T x) { return b ? -x : x; } template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, size_t... indices> -constexpr KFR_INTRIN cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) +constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) { return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) : cos_using_table<T>(size, (indices / 2 * step + start)))...); } template <typename T, size_t width, size_t... indices> -constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, size_t start, - size_t step) +constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, + size_t start, size_t step) { return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start) : cos_using_table<T>(size, indices / 2 * step + start))...); } template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> -constexpr KFR_INTRIN cvec<T, width> fixed_twiddle() +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle() { return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); } template <typename T, size_t width> -constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) { return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); } @@ -480,7 +483,7 @@ constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, siz // constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); template <typename T, size_t N, bool inverse> 
-constexpr cvec<T, N> twiddleimagmask() +constexpr KFR_INTRINSIC cvec<T, N> twiddleimagmask() { return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); } @@ -498,7 +501,7 @@ CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) template <size_t k, size_t size, bool inverse = false, typename T, size_t width, size_t kk = (inverse ? size - k : k) % size> -KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) +KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) { constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); if (kk == 0) @@ -540,7 +543,7 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) } template <size_t N, typename T> -KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) +KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) { const cvec<T, N> sum = a0 + a1; const cvec<T, N> dif = a0 - a1; @@ -549,15 +552,15 @@ KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N } template <size_t N, typename T> -KFR_INTRIN void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) +KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) { butterfly2<N>(a0, a1, a0, a1); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { cvec<T, N> sum02, sum13, diff02, diff13; cvec<T, N * 2> a01, a23, sum0213, diff0213; @@ -589,9 +592,9 @@ KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, cons } template <size_t N, bool inverse = 
false, typename T> -KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; @@ -616,11 +619,11 @@ KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, - cvec<T, N>& w7) +KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, + cvec<T, N>& w7) { cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); @@ -642,14 +645,14 @@ KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) +KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) { 
butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7); } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) +KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) { cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; @@ -670,7 +673,7 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, 8>& v8) +KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) { cvec<T, 2> w0, w1, w2, w3; split(v8, w0, w1, w2, w3); @@ -679,7 +682,7 @@ KFR_INTRIN void butterfly8(cvec<T, 8>& v8) } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly32(cvec<T, 32>& v32) +KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) { cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; split(v32, w0, w1, w2, w3, w4, w5, w6, w7); @@ -701,7 +704,7 @@ KFR_INTRIN void butterfly32(cvec<T, 32>& v32) } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123) +KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -713,7 +716,7 @@ KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123) } template <size_t N, typename T> -KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01) +KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) { cvec<T, N> a0; cvec<T, N> a1; @@ -723,7 +726,7 @@ KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01) } template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) +KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) { if (split_format) { @@ -750,9 +753,9 @@ KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec< } template <size_t N, bool inverse = false, bool split_format = 
false, typename T> -KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, - const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, + const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) { apply_twiddle<N, inverse, split_format>(a1, tw1, w1); apply_twiddle<N, inverse, split_format>(a2, tw2, w2); @@ -760,31 +763,31 @@ KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, cons } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, - const cvec<T, N>& tw3) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, + const cvec<T, N>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3); } template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, - const cvec<T, 1>& tw3) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, + const cvec<T, 1>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); } template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, - cvec<T, N / 2> tw3) 
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, + cvec<T, N / 2> tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) +KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) { cvec<T, 4> b0, b1, b2, b3; cvec<T, 4> c0, c1, c2, c3; @@ -812,7 +815,7 @@ KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, c } template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -830,7 +833,7 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) } template <bool inverse, bool aligned, typename T> -KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) { cvec<T, 16> w0, w1, w2, w3; @@ -886,7 +889,7 @@ KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly16(cvec<T, 16>& v16) +KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16) { butterfly4<4, inverse>(v16); apply_twiddles4<0, 4, 4, inverse>(v16); @@ -895,7 +898,7 @@ KFR_INTRIN void butterfly16(cvec<T, 16>& v16) } template <size_t index, bool inverse = false, typename T> -KFR_INTRIN void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) { constexpr size_t N = 4; @@ -954,7 +957,7 @@ KFR_INTRIN void 
butterfly16_multi_natural(complex<T>* out, const complex<T>* in) } template <size_t index, bool inverse = false, typename T> -KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) { constexpr size_t N = 4; @@ -1011,7 +1014,7 @@ KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) } template <size_t n2, size_t nnstep, size_t N, typename T> -KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1) +KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1) { cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>(); @@ -1026,8 +1029,8 @@ static const cvec<T, N> tw3i1 = static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01, - cvec<T, N>& w02) +KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, + cvec<T, N>& w01, cvec<T, N>& w02) { const cvec<T, N> sum1 = a01 + a02; @@ -1043,15 +1046,16 @@ KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) +KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) { butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, cvec<T, N>& w0, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5) +KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + cvec<T, N>& 
w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5) { cvec<T, N* 2> a03 = concat(a0, a3); cvec<T, N* 2> a25 = concat(a2, a5); @@ -1073,8 +1077,8 @@ KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5) +KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5) { butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5); } @@ -1090,11 +1094,11 @@ const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473), (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) }; template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, - cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) +KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) { cvec<T, N* 3> a012 = concat(a0, a1, a2); cvec<T, N* 3> a345 = concat(a3, a4, a5); @@ -1121,8 +1125,8 @@ KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, 
cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) +KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) { butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8); } @@ -1149,9 +1153,10 @@ static const cvec<T, N> tw7i3 = static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, - cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06) +KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, + cvec<T, N>& w06) { const cvec<T, N> sum1 = a01 + a06; const cvec<T, N> dif1 = swap<2>(a01 - a06); @@ -1184,8 +1189,8 @@ KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, cvec<T, N>& a6) +KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6) { butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6); } @@ -1226,11 +1231,11 @@ static const cvec<T, N> tw11i5 = static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N> a05, cvec<T, N> a06, 
cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, - cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, - cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, - cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) +KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, + cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, + cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, + cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) { const cvec<T, N> sum1 = a01 + a10; const cvec<T, N> dif1 = swap<2>(a01 - a10); @@ -1300,9 +1305,9 @@ const static cvec<T, N> tw5i2 = static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, - const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, - cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) +KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) { const cvec<T, N> sum1 = a01 + a04; const cvec<T, N> dif1 = swap<2>(a01 - a04); @@ -1323,12 +1328,12 @@ KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const c } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, - const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, - cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, 
cvec<T, N>& w6, cvec<T, N>& w7, - cvec<T, N>& w8, cvec<T, N>& w9) +KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, + cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, + cvec<T, N>& w8, cvec<T, N>& w9) { cvec<T, N* 2> a05 = concat(a0, a5); cvec<T, N* 2> a27 = concat(a2, a7); @@ -1363,91 +1368,96 @@ KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cv } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, - vec<T, N>& out1) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, + vec<T, N>& out1) { butterfly2<N / 2>(in0, in1, out0, out1); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) { butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3) { butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3); } 
template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4) { butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) { butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& 
out4, vec<T, N>& out5, + vec<T, N>& out6) { butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, - vec<T, N>& out7) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7) { butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, - vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, + vec<T, N>& out7, vec<T, N>& out8) { butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, 
out2, out3, out4, out5, out6, out7, out8); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, - const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, - vec<T, N>& out8, vec<T, N>& out9) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, + vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9) { butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, - const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, - vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10, + vec<T, N>& out0, vec<T, N>& out1, vec<T, 
N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, + vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) { butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10); } template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> -KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) +KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr)); if (transposed) @@ -1456,8 +1466,8 @@ KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec } // Warning: Reads past the end. Use with care -KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1, - cvec<f32, 4>& w2) +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2) { cvec<f32, 4> w3; cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9)); @@ -1465,8 +1475,8 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3 split(v16, w0, w1, w2, w3); } -KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1, - cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) { cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15)); v16 = digitreverse4<2>(v16); @@ -1475,7 +1485,7 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3 } template <bool transposed, typename T, size_t... 
N, size_t Nout = csum<size_t, N...>()> -KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) +KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) { auto temp = concat(args...); if (transposed) @@ -1484,20 +1494,21 @@ KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1))); } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1))); } // Non-final template <typename T, size_t width, size_t radix, bool inverse, size_t... I> -KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, - complex<T>* out, const complex<T>* in, const complex<T>* tw, size_t stride) +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* tw, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1513,8 +1524,8 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize // Final template <typename T, size_t width, size_t radix, bool inverse, size_t... 
I> -KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, - complex<T>* out, const complex<T>* in, size_t stride) +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1527,17 +1538,17 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize } template <size_t width, size_t radix, typename... Args> -KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) +KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) { butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...); } template <typename... Args> -KFR_INTRIN void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) +KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) { } template <size_t width, typename... Args> -KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) +KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) { CMT_LOOP_NOUNROLL for (; i < count / width * width; i += width) @@ -1546,7 +1557,7 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&. } template <size_t width, typename... Args> -KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args) +KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... args) { CMT_ASSUME(count > 0); size_t i = 0; @@ -1554,16 +1565,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... 
args) } template <typename T, bool inverse, typename Tradix, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*, - const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t) +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*, + const complex<T>*, Tstride, size_t, size_t, const complex<T>*, + size_t) { } template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, typename Thalfradixsqr, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, Tstride ostride, Thalfradix halfradix, - Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, Tstride ostride, Thalfradix halfradix, + Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) { CMT_LOOP_NOUNROLL for (; i < halfradix / width * width; i += width) @@ -1605,19 +1617,19 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<in } template <typename T> -KFR_SINTRIN vec<T, 2> hcadd(vec<T, 2> value) +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value) { return value; } template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)> -KFR_SINTRIN vec<T, 2> hcadd(vec<T, N> value) +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value) { return hcadd(low(value) + high(value)); } template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - const complex<T>* twiddle, Tstride ostride = Tstride{}) +KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle, Tstride ostride = Tstride{}) { CMT_ASSUME(radix > 0); { @@ -1636,8 +1648,7 @@ 
KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } cwrite<1>(out, hcadd(sum) + sums); } - const auto halfradix = radix / 2; - const auto halfradix_sqr = halfradix * halfradix; + const auto halfradix = radix / 2; CMT_ASSUME(halfradix > 0); size_t i = 0; @@ -1646,9 +1657,9 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, const complex<T>* twiddle, - Tstride ostride = Tstride{}) +KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle, + Tstride ostride = Tstride{}) { { cvec<T, width> sum = T(); @@ -1676,16 +1687,16 @@ KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, compl } template <typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - complex<T>* temp, const complex<T>* twiddle, Tstride ostride = {}) +KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + complex<T>*, const complex<T>* twiddle, Tstride ostride = {}) { cswitch(csizes_t<11, 13>(), radix, [&](auto radix_) CMT_INLINE_LAMBDA { - constexpr size_t width = platform<T>::vector_width; + constexpr size_t width = vector_width<T>; spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); }, [&]() CMT_INLINE_LAMBDA { - constexpr size_t width = platform<T>::vector_width; + constexpr size_t width = vector_width<T>; generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); }); } @@ -1697,25 +1708,25 @@ template <typename T, size_t N> constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); 
template <bool A = false, typename T, size_t N> -KFR_INTRIN void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) +KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, bitreverse<2>(x)); } template <bool A = false, typename T, size_t N> -KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) +KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, digitreverse4<2>(x)); } template <size_t N, bool A = false, typename T> -KFR_INTRIN cvec<T, N> cbitreverse_read(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src) { return bitreverse<2>(cread<N, A>(src)); } template <size_t N, bool A = false, typename T> -KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) { return digitreverse4<2>(cread<N, A>(src)); } @@ -1723,7 +1734,7 @@ KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src) #if 1 template <> -KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) +KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) { return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12), cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13), @@ -1731,7 +1742,7 @@ KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); } template <> -KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) +KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) { cwrite<1>(dest, part<16, 0>(x)); cwrite<1>(dest + 4, part<16, 1>(x)); @@ -1754,7 +1765,8 @@ KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const v cwrite<1>(dest + 15, part<16, 
15>(x)); } #endif -} // namespace internal +} // namespace intrinsics +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/reference_dft.hpp b/include/kfr/dft/reference_dft.hpp @@ -25,13 +25,13 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" #include "../base/small_buffer.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" #include <cmath> #include <vector> diff --git a/include/kfr/dsp.hpp b/include/kfr/dsp.hpp @@ -33,7 +33,6 @@ #include "dsp/fir_design.hpp" #include "dsp/fracdelay.hpp" #include "dsp/goertzel.hpp" -#include "dsp/interpolation.hpp" #include "dsp/mixdown.hpp" #include "dsp/oscillators.hpp" #include "dsp/sample_rate_conversion.hpp" diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup biquad * @{ */ /* @@ -26,13 +26,16 @@ #pragma once #include "../base/filter.hpp" -#include "../base/function.hpp" -#include "../base/operators.hpp" #include "../base/pointer.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" +#include "../testo/assert.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class biquad_type { @@ -53,17 +56,24 @@ template <typename T> struct biquad_params { template <typename U> - constexpr biquad_params(const biquad_params<U>& bq) noexcept - : a0(static_cast<T>(bq.a0)), a1(static_cast<T>(bq.a1)), a2(static_cast<T>(bq.a2)), - b0(static_cast<T>(bq.b0)), b1(static_cast<T>(bq.b1)), b2(static_cast<T>(bq.b2)) + constexpr biquad_params(const biquad_params<U>& bq) CMT_NOEXCEPT : a0(static_cast<T>(bq.a0)), + a1(static_cast<T>(bq.a1)), + a2(static_cast<T>(bq.a2)), + b0(static_cast<T>(bq.b0)), + 
b1(static_cast<T>(bq.b1)), + b2(static_cast<T>(bq.b2)) { } constexpr static bool is_pod = true; static_assert(std::is_floating_point<T>::value, "T must be a floating point type"); - constexpr biquad_params() noexcept : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {} - constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) noexcept - : a0(a0), a1(a1), a2(a2), b0(b0), b1(b1), b2(b2) + constexpr biquad_params() CMT_NOEXCEPT : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {} + constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) CMT_NOEXCEPT : a0(a0), + a1(a1), + a2(a2), + b0(b0), + b1(b1), + b2(b2) { } T a0; @@ -90,7 +100,7 @@ struct biquad_state vec<T, filters> s1; vec<T, filters> s2; vec<T, filters> out; - constexpr biquad_state() noexcept : s1(0), s2(0), out(0) {} + constexpr biquad_state() CMT_NOEXCEPT : s1(0), s2(0), out(0) {} }; template <typename T, size_t filters, KFR_ARCH_DEP> @@ -102,8 +112,8 @@ struct biquad_block vec<T, filters> b1; vec<T, filters> b2; - constexpr biquad_block() noexcept : a1(0), a2(0), b0(1), b1(0), b2(0) {} - CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) noexcept + constexpr biquad_block() CMT_NOEXCEPT : a1(0), a2(0), b0(1), b1(0), b2(0) {} + CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) CMT_NOEXCEPT { count = count > filters ? 
filters : count; for (size_t i = 0; i < count; i++) @@ -125,38 +135,40 @@ struct biquad_block } template <size_t count> - constexpr biquad_block(const biquad_params<T> (&bq)[count]) noexcept : biquad_block(bq, count) + constexpr biquad_block(const biquad_params<T> (&bq)[count]) CMT_NOEXCEPT : biquad_block(bq, count) { static_assert(count <= filters, "count > filters"); } }; template <size_t filters, typename T, typename E1, KFR_ARCH_DEP> -struct expression_biquads_l : public expression_base<E1> +struct expression_biquads_l : public expression_with_arguments<E1> { using value_type = T; expression_biquads_l(const biquad_block<T, filters>& bq, E1&& e1) - : expression_base<E1>(std::forward<E1>(e1)), bq(bq) + : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq) { } template <size_t width> - KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const + friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads_l& self, cinput_t cinput, size_t index, + vec_shape<T, width> t) { - const vec<T, width> in = this->argument_first(cinput, index, t); + const vec<T, width> in = self.argument_first(cinput, index, t); vec<T, width> out; CMT_LOOP_UNROLL for (size_t i = 0; i < width; i++) { - state.out = process(bq, state, insertleft(in[i], state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out)); + out[i] = self.state.out[filters - 1]; } return out; } - KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state, - const vec<T, filters>& in) + static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq, + biquad_state<T, filters>& state, + const vec<T, filters>& in) { const vec<T, filters> out = bq.b0 * in + state.s1; state.s1 = state.s2 + bq.b1 * in - bq.a1 * out; @@ -168,73 +180,74 @@ struct expression_biquads_l : public expression_base<E1> }; template <size_t filters, typename T, 
typename E1, KFR_ARCH_DEP> -struct expression_biquads : expression_base<E1> +struct expression_biquads : expression_with_arguments<E1> { using value_type = T; expression_biquads(const biquad_block<T, filters>& bq, E1&& e1) - : expression_base<E1>(std::forward<E1>(e1)), bq(bq), block_end(0) + : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq), block_end(0) { } - CMT_INLINE void begin_block(cinput_t cinput, size_t size) const + void begin_block(cinput_t cinput, size_t size) const { block_end = size; for (size_t i = 0; i < filters - 1; i++) { - const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_t<T, 1>()) : 0; + const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_shape<T, 1>()) : 0; state.out = process(bq, state, insertleft(in[0], state.out)); } } - CMT_INLINE void end_block(cinput_t cinput, size_t) const { state = saved_state; } + void end_block(cinput_t, size_t) const { state = saved_state; } template <size_t width> - KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const + friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads& self, cinput_t cinput, size_t index, + vec_shape<T, width> t) { index += filters - 1; - vec<T, width> out; - if (index + width <= block_end) + vec<T, width> out{}; + if (index + width <= self.block_end) { - const vec<T, width> in = this->argument_first(cinput, index, t); + const vec<T, width> in = self.argument_first(cinput, index, t); CMT_LOOP_UNROLL for (size_t i = 0; i < width; i++) { - state.out = process(bq, state, insertleft(in[i], state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out)); + out[i] = self.state.out[filters - 1]; } - if (index + width == block_end) - saved_state = state; + if (index + width == self.block_end) + self.saved_state = self.state; } - else if (index >= block_end) + else if (index >= self.block_end) { CMT_LOOP_UNROLL for (size_t i = 0; i < 
width; i++) { - state.out = process(bq, state, insertleft(T(0), state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out)); + out[i] = self.state.out[filters - 1]; } } else { size_t i = 0; - for (; i < std::min(width, block_end - index); i++) + for (; i < std::min(width, self.block_end - index); i++) { - const vec<T, 1> in = this->argument_first(cinput, index + i, vec_t<T, 1>()); - state.out = process(bq, state, insertleft(in[0], state.out)); - out[i] = state.out[filters - 1]; + const vec<T, 1> in = self.argument_first(cinput, index + i, vec_shape<T, 1>()); + self.state.out = process(self.bq, self.state, insertleft(in[0], self.state.out)); + out[i] = self.state.out[filters - 1]; } - saved_state = state; + self.saved_state = self.state; for (; i < width; i++) { - state.out = process(bq, state, insertleft(T(0), state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out)); + out[i] = self.state.out[filters - 1]; } } return out; } - KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state, - vec<T, filters> in) + static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq, + biquad_state<T, filters>& state, vec<T, filters> in) { const vec<T, filters> out = bq.b0 * in + state.s1; state.s1 = state.s2 + bq.b1 * in - bq.a1 * out; @@ -255,7 +268,7 @@ struct expression_biquads : expression_base<E1> * @param e1 Input expression */ template <typename T, typename E1> -CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1) +KFR_FUNCTION internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1) { const biquad_params<T> bqs[1] = { bq }; return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1)); @@ -268,8 +281,8 @@ CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& * 
@note This implementation introduces delay of N - 1 samples, where N is the filter count. */ template <size_t filters, typename T, typename E1> -CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters], - E1&& e1) +KFR_FUNCTION internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters], + E1&& e1) { return internal::expression_biquads_l<filters, T, E1>(bq, std::forward<E1>(e1)); } @@ -281,7 +294,8 @@ CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_ * @note This implementation has zero latency */ template <size_t filters, typename T, typename E1> -CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters], E1&& e1) +KFR_FUNCTION internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters], + E1&& e1) { return internal::expression_biquads<filters, T, E1>(bq, std::forward<E1>(e1)); } @@ -292,10 +306,11 @@ CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_para * @param e1 Input expression * @note This implementation has zero latency */ -template <typename T, typename E1> -CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1) +template <size_t maxfiltercount = 4, typename T, typename E1> +KFR_FUNCTION expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1) { - return cswitch(csizes_t<1, 2, 4, 8, 16, 32, 64>(), next_poweroftwo(count), + constexpr csizes_t<1, 2, 4, 8, 16, 32, 64> sizes; + return cswitch(cfilter(sizes, sizes <= csize_t<maxfiltercount>{}), next_poweroftwo(count), [&](auto x) { constexpr size_t filters = x; return to_pointer(internal::expression_biquads<filters, T, E1>( @@ -304,12 +319,12 @@ CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count [&] { return to_pointer(zeros<T>()); }); } -template <typename T> +template <typename T, size_t maxfiltercount = 4> class 
biquad_filter : public expression_filter<T> { public: biquad_filter(const biquad_params<T>* bq, size_t count) - : expression_filter<T>(biquad(bq, count, placeholder<T>())) + : expression_filter<T>(biquad<maxfiltercount>(bq, count, placeholder<T>())) { } @@ -318,4 +333,5 @@ public: { } }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/biquad_design.hpp b/include/kfr/dsp/biquad_design.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup biquad * @{ */ /* @@ -30,6 +30,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ /** * @brief Calculates coefficients for the all-pass biquad filter @@ -38,7 +40,7 @@ namespace kfr * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) { const T alpha = std::sin(frequency) / 2.0 * Q; const T cs = std::cos(frequency); @@ -59,7 +61,7 @@ biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -79,7 +81,7 @@ biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -99,7 +101,7 @@ biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> 
biquad_bandpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -119,7 +121,7 @@ biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -140,7 +142,7 @@ biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -177,7 +179,7 @@ biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> g * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -214,7 +216,7 @@ biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -243,4 +245,5 @@ biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) } return result; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/dcremove.hpp b/include/kfr/dsp/dcremove.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp 
+/** @addtogroup biquad * @{ */ /* @@ -30,11 +30,14 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename E1, typename T = flt_type<value_type_of<E1>>> -CMT_INLINE internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025) +KFR_INTRINSIC internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025) { const biquad_params<T> bqs[1] = { biquad_highpass(cutoff, 0.5) }; return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,43 +30,48 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { template <size_t delay, typename E> -struct expression_delay : expression_base<E> +struct expression_delay : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; template <size_t N, KFR_ENABLE_IF(N <= delay)> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { vec<T, N> out; - size_t c = cursor; - data.ringbuf_read(c, out); - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - data.ringbuf_write(cursor, in); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + self.data.ringbuf_write(self.cursor, in); return out; } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + friend vec<T, 1> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, 1>) { T out; - size_t c = cursor; - data.ringbuf_read(c, out); - const T in = this->argument_first(cinput, 
index, vec_t<T, 1>())[0]; - data.ringbuf_write(cursor, in); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const T in = self.argument_first(cinput, index, vec_shape<T, 1>())[0]; + self.data.ringbuf_write(self.cursor, in); return out; } template <size_t N, KFR_ENABLE_IF(N > delay)> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { vec<T, delay> out; - size_t c = cursor; - data.ringbuf_read(c, out); - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - data.ringbuf_write(cursor, slice<N - delay, delay>(in)); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + self.data.ringbuf_write(self.cursor, slice<N - delay, delay>(in)); return concat_and_slice<0, N>(out, in); } @@ -75,18 +80,19 @@ struct expression_delay : expression_base<E> }; template <typename E> -struct expression_delay<1, E> : expression_base<E> +struct expression_delay<1, E> : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - const vec<T, N> out = insertleft(data, in); - data = in[N - 1]; + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + const vec<T, N> out = insertleft(self.data, in); + self.data = in[N - 1]; return out; } mutable value_type data = value_type(0); @@ -103,9 +109,10 @@ struct expression_delay<1, E> : expression_base<E> * @endcode */ template <size_t samples = 1, 
typename E1> -CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>()) +KFR_INTRINSIC internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>()) { static_assert(samples >= 1 && samples < 1024, ""); return internal::expression_delay<samples, E1>(std::forward<E1>(e1)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/ebu.hpp b/include/kfr/dsp/ebu.hpp @@ -1,3 +1,28 @@ +/** @addtogroup ebu + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ #pragma once #include <vector> @@ -16,15 +41,17 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Winaccessible-base") namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> -KFR_SINTRIN T energy_to_loudness(T energy) +KFR_INTRINSIC T energy_to_loudness(T energy) { return T(10) * log10(energy) - T(0.691); } template <typename T> -KFR_SINTRIN T loudness_to_energy(T loudness) +KFR_INTRINSIC T loudness_to_energy(T loudness) { return exp10((loudness + T(0.691)) * T(0.1)); } @@ -88,8 +115,8 @@ public: } private: - mutable bool m_integrated_cached; mutable T m_integrated; + mutable bool m_integrated_cached; }; template <typename T> @@ -98,10 +125,10 @@ struct lra_vec : public univector<T> private: void compute() const { - m_range_high = -70.0; - m_range_low = -70.0; - static const T PRC_LOW = 0.10; - static const T PRC_HIGH = 0.95; + m_range_high = -70; + m_range_low = -70; + static const T PRC_LOW = T(0.10); + static const T PRC_HIGH = T(0.95); const T z_total = mean(*this); const T relative_gate = energy_to_loudness(z_total) - 20; @@ -151,13 +178,13 @@ public: } private: - mutable bool m_lra_cached; mutable T m_range_low; mutable T m_range_high; + mutable bool m_lra_cached; }; template <typename T> -KFR_SINTRIN expression_pointer<T> make_kfilter(int samplerate) +KFR_INTRINSIC expression_pointer<T> make_kfilter(int samplerate) { const biquad_params<T> bq[] = { biquad_highshelf(T(1681.81 / samplerate), T(+4.0)), @@ -199,8 +226,8 @@ public: void reset() { - std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), 0); - std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), 0); + std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), T(0)); + std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), T(0)); } void process_packet(const T* src) @@ -214,15 +241,15 @@ public: Speaker get_speaker() const { return m_speaker; } private: + const int m_sample_rate; const Speaker m_speaker; 
const T m_input_gain; - const int m_sample_rate; const size_t m_packet_size; expression_pointer<T> m_kfilter; - T m_output_energy_gain; - univector<T> m_buffer; univector<T> m_short_sum_of_squares; univector<T> m_momentary_sum_of_squares; + T m_output_energy_gain; + univector<T> m_buffer; size_t m_buffer_cursor; size_t m_short_sum_of_squares_cursor; size_t m_momentary_sum_of_squares_cursor; @@ -239,7 +266,7 @@ public: { for (Speaker sp : channels) { - m_channels.emplace_back(sample_rate, sp, packet_size_factor, 1); + m_channels.emplace_back(sample_rate, sp, packet_size_factor, T(1)); } } @@ -327,6 +354,7 @@ private: lra_vec<T> m_lra_buffer; }; +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,10 +30,12 @@ #include "../base/memory.hpp" #include "../base/reduce.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T, size_t Size> using fir_taps = univector<T, Size>; @@ -77,7 +79,7 @@ struct state_holder state_holder() = delete; state_holder(const state_holder&) = default; state_holder(state_holder&&) = default; - constexpr state_holder(const T& state) noexcept : s(state) {} + constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {} T s; }; @@ -87,30 +89,32 @@ struct state_holder<T, true> state_holder() = delete; state_holder(const state_holder&) = default; state_holder(state_holder&&) = default; - constexpr state_holder(const T& state) noexcept : s(state) {} + constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {} const T& s; }; template <size_t tapcount, typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP> -struct expression_short_fir : expression_base<E1> +struct expression_short_fir : expression_with_arguments<E1> { using value_type = U; 
expression_short_fir(E1&& e1, const short_fir_state<tapcount, T, U>& state) - : expression_base<E1>(std::forward<E1>(e1)), state(state) + : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state) { } template <size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const + KFR_INTRINSIC friend vec<U, N> get_elements(const expression_short_fir& self, cinput_t cinput, + size_t index, vec_shape<U, N> x) { - vec<U, N> in = this->argument_first(cinput, index, x); + vec<U, N> in = self.argument_first(cinput, index, x); - vec<U, N> out = in * state.s.taps[0]; - cforeach(csizeseq_t<tapcount - 1, 1>(), [&](auto I) { - out = out + concat_and_slice<tapcount - 1 - I, N>(state.s.delayline, in) * state.s.taps[I]; + vec<U, N> out = in * self.state.s.taps.front(); + cforeach(csizeseq<tapcount - 1, 1>, [&](auto I) { + out = out + + concat_and_slice<tapcount - 1 - I, N>(self.state.s.delayline, in) * self.state.s.taps[I]; }); - state.s.delayline = concat_and_slice<N, tapcount - 1>(state.s.delayline, in); + self.state.s.delayline = concat_and_slice<N, tapcount - 1>(self.state.s.delayline, in); return out; } @@ -118,31 +122,33 @@ struct expression_short_fir : expression_base<E1> }; template <typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP> -struct expression_fir : expression_base<E1> +struct expression_fir : expression_with_arguments<E1> { using value_type = U; expression_fir(E1&& e1, const fir_state<T, U>& state) - : expression_base<E1>(std::forward<E1>(e1)), state(state) + : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state) { } template <size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const + KFR_INTRINSIC friend vec<U, N> get_elements(const expression_fir& self, cinput_t cinput, size_t index, + vec_shape<U, N> x) { - const size_t tapcount = state.s.taps.size(); - const vec<U, N> input = this->argument_first(cinput, index, x); + const size_t tapcount = 
self.state.s.taps.size(); + const vec<U, N> input = self.argument_first(cinput, index, x); vec<U, N> output; - size_t cursor = state.s.delayline_cursor; + size_t cursor = self.state.s.delayline_cursor; CMT_LOOP_NOUNROLL for (size_t i = 0; i < N; i++) { - state.s.delayline.ringbuf_write(cursor, input[i]); - output[i] = dotproduct(state.s.taps, state.s.delayline.slice(cursor) /*, tapcount - cursor*/) + - dotproduct(state.s.taps.slice(tapcount - cursor), state.s.delayline /*, cursor*/); + self.state.s.delayline.ringbuf_write(cursor, input[i]); + output[i] = + dotproduct(self.state.s.taps, self.state.s.delayline.slice(cursor) /*, tapcount - cursor*/) + + dotproduct(self.state.s.taps.slice(tapcount - cursor), self.state.s.delayline /*, cursor*/); } - state.s.delayline_cursor = cursor; + self.state.s.delayline_cursor = cursor; return output; } state_holder<fir_state<T, U>, stateless> state; @@ -155,7 +161,7 @@ struct expression_fir : expression_base<E1> * @param taps coefficients for the FIR filter */ template <typename T, typename E1, univector_tag Tag> -CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps) +KFR_INTRINSIC internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps) { return internal::expression_fir<T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps.ref()); } @@ -166,7 +172,7 @@ CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const * @param e1 an input expression */ template <typename T, typename U, typename E1> -CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1) +KFR_INTRINSIC internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1) { return internal::expression_fir<T, U, E1, true>(std::forward<E1>(e1), state); } @@ -178,7 +184,7 @@ CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, * @param taps coefficients for the FIR filter */ template <typename T, 
size_t TapCount, typename E1> -CMT_INLINE internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir( +KFR_INTRINSIC internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir( E1&& e1, const univector<T, TapCount>& taps) { static_assert(TapCount >= 2 && TapCount <= 32, "Use short_fir only for small FIR filters"); @@ -214,4 +220,5 @@ protected: private: fir_state<T, U> state; }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/fir_design.hpp b/include/kfr/dsp/fir_design.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -25,13 +25,15 @@ */ #pragma once -#include "../base/sin_cos.hpp" +#include "../math/sin_cos.hpp" #include "fir.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace intrinsics +namespace internal { template <typename T> void fir_lowpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window, bool normalize = true) @@ -115,11 +117,11 @@ void fir_bandstop(univector_ref<T> taps, T frequency1, T frequency2, const expre taps = taps * invsum; } } -} // namespace intrinsics -KFR_I_FN(fir_lowpass) -KFR_I_FN(fir_highpass) -KFR_I_FN(fir_bandpass) -KFR_I_FN(fir_bandstop) +} // namespace internal +KFR_I_FN_FULL(fir_lowpass, internal::fir_lowpass) +KFR_I_FN_FULL(fir_highpass, internal::fir_highpass) +KFR_I_FN_FULL(fir_bandpass, internal::fir_bandpass) +KFR_I_FN_FULL(fir_bandstop, internal::fir_bandstop) /** * @brief Calculates coefficients for the low-pass FIR filter @@ -129,10 +131,10 @@ KFR_I_FN(fir_bandstop) * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, - bool normalize = true) +KFR_INTRINSIC void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return 
intrinsics::fir_lowpass(taps.slice(), cutoff, window, normalize); + return internal::fir_lowpass(taps.slice(), cutoff, window, normalize); } /** @@ -143,10 +145,10 @@ CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const e * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, - bool normalize = true) +KFR_INTRINSIC void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_highpass(taps.slice(), cutoff, window, normalize); + return internal::fir_highpass(taps.slice(), cutoff, window, normalize); } /** @@ -158,10 +160,10 @@ CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize); + return internal::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize); } /** @@ -173,49 +175,50 @@ CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, id * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, + const 
expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize); + return internal::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize); } /** * @copydoc kfr::fir_lowpass */ template <typename T> -CMT_INLINE void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_lowpass(taps, cutoff, window, normalize); + return internal::fir_lowpass(taps, cutoff, window, normalize); } /** * @copydoc kfr::fir_highpass */ template <typename T> -CMT_INLINE void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_highpass(taps, cutoff, window, normalize); + return internal::fir_highpass(taps, cutoff, window, normalize); } /** * @copydoc kfr::fir_bandpass */ template <typename T> -CMT_INLINE void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandpass(taps, frequency1, frequency2, window, normalize); + return internal::fir_bandpass(taps, frequency1, frequency2, window, normalize); } /** * @copydoc kfr::fir_bandstop */ template <typename T> -CMT_INLINE void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) 
+KFR_INTRINSIC void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandstop(taps, frequency1, frequency2, window, normalize); + return internal::fir_bandstop(taps, frequency1, frequency2, window, normalize); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/fracdelay.hpp b/include/kfr/dsp/fracdelay.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,12 +30,16 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ + template <typename T, typename E1> -CMT_INLINE internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay) +KFR_INTRINSIC internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay) { if (delay < 0) delay = 0; univector<T, 2> taps({ 1 - delay, delay }); return internal::expression_short_fir<2, T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -26,12 +26,15 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/complex.hpp" -#include "../base/sin_cos.hpp" -#include "../base/vec.hpp" +#include "../math/sin_cos.hpp" +#include "../simd/complex.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + namespace internal { @@ -48,7 +51,7 @@ struct expression_goertzel : output_expression result.imag(q2 * sin(omega)); } template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x) { vec<T, N> in = x; CMT_LOOP_UNROLL @@ -85,7 +88,7 @@ struct expression_parallel_goertzel : output_expression } } template <typename U, size_t N> - CMT_INLINE void 
operator()(coutput_t, size_t index, const vec<U, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x) { const vec<T, N> in = x; CMT_LOOP_UNROLL @@ -103,18 +106,19 @@ struct expression_parallel_goertzel : output_expression vec<T, width> q1; vec<T, width> q2; }; -}; // namespace internal +} // namespace internal template <typename T> -KFR_SINTRIN internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) +KFR_INTRINSIC internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) { return internal::expression_goertzel<T>(result, omega); } template <typename T, size_t width> -KFR_SINTRIN internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], - const T (&omega)[width]) +KFR_INTRINSIC internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], + const T (&omega)[width]) { return internal::expression_parallel_goertzel<T, width>(result, read<width>(omega)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/interpolation.hpp b/include/kfr/dsp/interpolation.hpp @@ -1,72 +0,0 @@ -/** @addtogroup dsp - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../base/select.hpp" -#include "../base/sin_cos.hpp" -#include "../base/vec.hpp" - -namespace kfr -{ - -template <typename T, typename M> -KFR_FUNC T nearest(M mu, T x1, T x2) -{ - return select(mu < M(0.5), x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T linear(M mu, T x1, T x2) -{ - return mix(mu, x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T cosine(M mu, T x1, T x2) -{ - return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T cubic(M mu, T x0, T x1, T x2, T x3) -{ - const T a0 = x3 - x2 - x0 + x1; - const T a1 = x0 - x1 - a0; - const T a2 = x2 - x0; - const T a3 = x1; - return horner(mu, a0, a1, a2, a3); -} - -template <typename T, typename M> -KFR_FUNC T catmullrom(M mu, T x0, T x1, T x2, T x3) -{ - const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1); - const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3; - const T a2 = T(0.5) * (x2 - x0); - const T a3 = x1; - return horner(mu, a0, a1, a2, a3); -} -} // namespace kfr diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -29,6 +29,9 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Returns template expression that returns the sum of all the inputs */ @@ -43,12 +46,12 @@ namespace internal struct stereo_matrix { template <typename T, size_t N> - CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const + KFR_MEM_INTRINSIC vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const { - return process(x, csizeseq_t<N>()); + return process(x, csizeseq<N>); } template <typename T, size_t N, size_t... 
indices> - CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const + KFR_MEM_INTRINSIC vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const { return vec<vec<T, 2>, N>(hadd(transpose(x[indices] * matrix))...); } @@ -79,4 +82,5 @@ Result mixdown_stereo(Left&& left, Right&& right, const f64x2x2& matrix) return Result(internal::stereo_matrix{ matrix }, pack(std::forward<Left>(left), std::forward<Right>(right))); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup oscillators * @{ */ /* @@ -26,19 +26,21 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/sin_cos.hpp" +#include "../math/sin_cos.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T = fbase> -KFR_FUNC static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0) +KFR_FUNCTION static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0) { return fract(counter(phase, frequency / sample_rate)); } template <typename T = fbase> -KFR_FUNC static auto phasor(identity<T> frequency) +KFR_FUNCTION static auto phasor(identity<T> frequency) { return phasor(frequency, 1, 0); } @@ -46,76 +48,76 @@ KFR_FUNC static auto phasor(identity<T> frequency) namespace intrinsics { template <typename T> -KFR_FUNC T rawsine(const T& x) +KFR_INTRINSIC T rawsine(const T& x) { return intrinsics::fastsin(x * constants<T>::pi_s(2)); } template <typename T> -KFR_FUNC T sinenorm(const T& x) +KFR_INTRINSIC T sinenorm(const T& x) { return intrinsics::rawsine(fract(x)); } template <typename T> -KFR_FUNC T sine(const T& x) +KFR_INTRINSIC T sine(const T& x) { return intrinsics::sinenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawsquare(const T& x) +KFR_INTRINSIC T rawsquare(const T& x) { return select(x < T(0.5), 
T(1), -T(1)); } template <typename T> -KFR_FUNC T squarenorm(const T& x) +KFR_INTRINSIC T squarenorm(const T& x) { return intrinsics::rawsquare(fract(x)); } template <typename T> -KFR_FUNC T square(const T& x) +KFR_INTRINSIC T square(const T& x) { return intrinsics::squarenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawsawtooth(const T& x) +KFR_INTRINSIC T rawsawtooth(const T& x) { return T(1) - 2 * x; } template <typename T> -KFR_FUNC T sawtoothnorm(const T& x) +KFR_INTRINSIC T sawtoothnorm(const T& x) { return intrinsics::rawsawtooth(fract(x)); } template <typename T> -KFR_FUNC T sawtooth(const T& x) +KFR_INTRINSIC T sawtooth(const T& x) { return intrinsics::sawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T isawtoothnorm(const T& x) +KFR_INTRINSIC T isawtoothnorm(const T& x) { return T(-1) + 2 * fract(x + 0.5); } template <typename T> -KFR_FUNC T isawtooth(const T& x) +KFR_INTRINSIC T isawtooth(const T& x) { return intrinsics::isawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawtriangle(const T& x) +KFR_INTRINSIC T rawtriangle(const T& x) { return 1 - abs(4 * x - 2); } template <typename T> -KFR_FUNC T trianglenorm(const T& x) +KFR_INTRINSIC T trianglenorm(const T& x) { return intrinsics::rawtriangle(fract(x + 0.25)); } template <typename T> -KFR_FUNC T triangle(const T& x) +KFR_INTRINSIC T triangle(const T& x) { return intrinsics::trianglenorm(constants<T>::recip_pi_s(1, 2) * x); } @@ -136,143 +138,145 @@ KFR_I_FN(isawtooth) KFR_I_FN(isawtoothnorm) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsine(const T1& x) +KFR_FUNCTION T1 rawsine(const T1& x) { return intrinsics::rawsine(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsine, E1> rawsine(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawsine, E1> rawsine(E1&& x) { return { fn::rawsine(), 
std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sine(const T1& x) +KFR_FUNCTION T1 sine(const T1& x) { return intrinsics::sine(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sine, E1> sine(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sine, E1> sine(E1&& x) { return { fn::sine(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sinenorm(const T1& x) +KFR_FUNCTION T1 sinenorm(const T1& x) { return intrinsics::sinenorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x) { return { fn::sinenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsquare(const T1& x) +KFR_FUNCTION T1 rawsquare(const T1& x) { return intrinsics::rawsquare(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x) { return { fn::rawsquare(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 square(const T1& x) +KFR_FUNCTION T1 square(const T1& x) { return intrinsics::square(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::square, E1> square(E1&& x) +KFR_FUNCTION internal::expression_function<fn::square, E1> square(E1&& x) { return { fn::square(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 squarenorm(const T1& x) +KFR_FUNCTION T1 squarenorm(const T1& x) { return intrinsics::squarenorm(x); } template <typename E1, 
KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x) { return { fn::squarenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawtriangle(const T1& x) +KFR_FUNCTION T1 rawtriangle(const T1& x) { return intrinsics::rawtriangle(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x) { return { fn::rawtriangle(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 triangle(const T1& x) +KFR_FUNCTION T1 triangle(const T1& x) { return intrinsics::triangle(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::triangle, E1> triangle(E1&& x) +KFR_FUNCTION internal::expression_function<fn::triangle, E1> triangle(E1&& x) { return { fn::triangle(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 trianglenorm(const T1& x) +KFR_FUNCTION T1 trianglenorm(const T1& x) { return intrinsics::trianglenorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x) { return { fn::trianglenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsawtooth(const T1& x) +KFR_FUNCTION T1 rawsawtooth(const T1& x) { return intrinsics::rawsawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x) 
+KFR_FUNCTION internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x) { return { fn::rawsawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sawtooth(const T1& x) +KFR_FUNCTION T1 sawtooth(const T1& x) { return intrinsics::sawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x) { return { fn::sawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sawtoothnorm(const T1& x) +KFR_FUNCTION T1 sawtoothnorm(const T1& x) { return intrinsics::sawtoothnorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x) { return { fn::sawtoothnorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 isawtooth(const T1& x) +KFR_FUNCTION T1 isawtooth(const T1& x) { return intrinsics::isawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x) +KFR_FUNCTION internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x) { return { fn::isawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 isawtoothnorm(const T1& x) +KFR_FUNCTION T1 isawtoothnorm(const T1& x) { return intrinsics::isawtoothnorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x) { return { fn::isawtoothnorm(), 
std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME + } // namespace kfr diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp @@ -25,14 +25,17 @@ */ #pragma once -#include "../base/function.hpp" #include "../base/memory.hpp" #include "../base/reduce.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" #include "window.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + enum class sample_rate_conversion_quality : int { draft = 4, @@ -52,32 +55,32 @@ struct samplerate_converter using ftype = subtype<T>; private: - KFR_INTRIN ftype window(ftype n) const + KFR_MEM_INTRINSIC ftype window(ftype n) const { return modzerobessel(kaiser_beta * sqrt(1 - sqr(2 * n - 1))) * reciprocal(modzerobessel(kaiser_beta)); } - KFR_INTRIN ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; } - KFR_INTRIN ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; } + KFR_MEM_INTRINSIC ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; } + KFR_MEM_INTRINSIC ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; } public: - static KFR_INTRIN size_t filter_order(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC size_t filter_order(sample_rate_conversion_quality quality) { - return 1 << (static_cast<int>(quality) + 1); + return size_t(1) << (static_cast<int>(quality) + 1); } /// @brief Returns sidelobe attenuation for the given quality (in dB) - static KFR_INTRIN ftype sidelobe_attenuation(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype sidelobe_attenuation(sample_rate_conversion_quality quality) { return (static_cast<int>(quality) - 3) * ftype(20); } /// @brief Returns transition width for the given quality (in rad) - static KFR_INTRIN ftype transition_width(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype 
transition_width(sample_rate_conversion_quality quality) { return (sidelobe_attenuation(quality) - 8) / (filter_order(quality) - 1) / ftype(2.285); } - static KFR_INTRIN ftype window_param(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype window_param(sample_rate_conversion_quality quality) { const ftype att = sidelobe_attenuation(quality); if (att > 50) @@ -112,7 +115,8 @@ public: for (itype j = 0, jj = 0; j < taps; j++) { - filter[size_t(j)] = sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1)); + filter[size_t(j)] = + sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1)); jj += size_t(interpolation_factor); if (jj >= taps) jj = jj - taps + 1; @@ -122,25 +126,31 @@ public: filter = filter * s; } - itype input_position_to_intermediate(itype in_pos) const { return in_pos * interpolation_factor; } - itype output_position_to_intermediate(itype out_pos) const { return out_pos * decimation_factor; } + KFR_MEM_INTRINSIC itype input_position_to_intermediate(itype in_pos) const + { + return in_pos * interpolation_factor; + } + KFR_MEM_INTRINSIC itype output_position_to_intermediate(itype out_pos) const + { + return out_pos * decimation_factor; + } - itype input_position_to_output(itype in_pos) const + KFR_MEM_INTRINSIC itype input_position_to_output(itype in_pos) const { return floor_div(input_position_to_intermediate(in_pos), decimation_factor).quot; } - itype output_position_to_input(itype out_pos) const + KFR_MEM_INTRINSIC itype output_position_to_input(itype out_pos) const { return floor_div(output_position_to_intermediate(out_pos), interpolation_factor).quot; } - itype output_size_for_input(itype input_size) const + KFR_MEM_INTRINSIC itype output_size_for_input(itype input_size) const { return input_position_to_output(input_position + input_size - 1) - input_position_to_output(input_position - 1); } - itype input_size_for_output(itype output_size) const + KFR_MEM_INTRINSIC itype 
input_size_for_output(itype output_size) const { return output_position_to_input(output_position + output_size - 1) - output_position_to_input(output_position - 1); @@ -183,7 +193,6 @@ public: const std::lldiv_t input_pos = floor_div(intermediate_start + interpolation_factor - 1, interpolation_factor); const itype input_start = input_pos.quot; // first input sample - const itype input_end = input_start + depth; const itype tap_start = interpolation_factor - 1 - input_pos.rem; const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(tap_start * depth)); @@ -219,8 +228,8 @@ public: return required_input_size; } - double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; } - size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); } + KFR_MEM_INTRINSIC double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; } + KFR_MEM_INTRINSIC size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); } ftype kaiser_beta; itype depth; @@ -244,130 +253,140 @@ template <size_t factor, size_t offset, typename E> struct expression_downsample; template <typename E> -struct expression_upsample<2, E> : expression_base<E> +struct expression_upsample<2, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() * 2; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 2; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - const vec<T, N / 2> x = this->argument_first(cinput, index / 2, vec_t<T, N / 2>()); + const vec<T, N / 2> x = self.argument_first(cinput, 
index / 2, vec_shape<T, N / 2>()); return interleave(x, zerovector(x)); } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 1>) { if (index & 1) return 0; else - return this->argument_first(cinput, index / 2, vec_t<T, 1>()); + return self.argument_first(cinput, index / 2, vec_shape<T, 1>()); } }; template <typename E> -struct expression_upsample<4, E> : expression_base<E> +struct expression_upsample<4, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() * 4; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 4; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N / 4> x = this->argument_first(cinput, index / 4, vec_t<T, N / 4>()); + const vec<T, N / 4> x = self.argument_first(cinput, index / 4, vec_shape<T, N / 4>()); const vec<T, N / 2> xx = interleave(x, zerovector(x)); return interleave(xx, zerovector(xx)); } - vec<T, 2> operator()(cinput_t cinput, size_t index, vec_t<T, 2>) const + KFR_INTRINSIC friend vec<T, 2> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 2>) CMT_NOEXCEPT { switch (index & 3) { case 0: - return interleave(this->argument_first(cinput, index / 4, vec_t<T, 1>()), zerovector<T, 1>()); + return interleave(self.argument_first(cinput, index / 4, vec_shape<T, 1>()), zerovector<T, 1>()); case 3: - return interleave(zerovector<T, 1>(), this->argument_first(cinput, index / 4, vec_t<T, 1>())); + 
return interleave(zerovector<T, 1>(), self.argument_first(cinput, index / 4, vec_shape<T, 1>())); default: return 0; } } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 1>) CMT_NOEXCEPT { if (index & 3) return 0; else - return this->argument_first(cinput, index / 4, vec_t<T, 1>()); + return self.argument_first(cinput, index / 4, vec_shape<T, 1>()); } }; template <typename E, size_t offset> -struct expression_downsample<2, offset, E> : expression_base<E> +struct expression_downsample<2, offset, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() / 2; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 2; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N* 2> x = this->argument_first(cinput, index * 2, vec_t<T, N * 2>()); - return x.shuffle(csizeseq_t<N, offset, 2>()); + const vec<T, N* 2> x = self.argument_first(cinput, index * 2, vec_shape<T, N * 2>()); + return x.shuffle(csizeseq<N, offset, 2>); } }; template <typename E, size_t offset> -struct expression_downsample<4, offset, E> : expression_base<E> +struct expression_downsample<4, offset, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() / 4; } + KFR_MEM_INTRINSIC size_t size() 
const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 4; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N* 4> x = this->argument_first(cinput, index * 4, vec_t<T, N * 4>()); - return x.shuffle(csizeseq_t<N, offset, 4>()); + const vec<T, N* 4> x = self.argument_first(cinput, index * 4, vec_shape<T, N * 4>()); + return x.shuffle(csizeseq<N, offset, 4>); } }; } // namespace internal template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize_t<0>()) +KFR_FUNCTION internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, + csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<2, offset, E1>(std::forward<E1>(e1)); } template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize_t<0>()) +KFR_FUNCTION internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, + csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<4, offset, E1>(std::forward<E1>(e1)); } template <typename E1> -CMT_INLINE internal::expression_upsample<2, E1> upsample2(E1&& e1) +KFR_FUNCTION internal::expression_upsample<2, E1> upsample2(E1&& e1) { return internal::expression_upsample<2, E1>(std::forward<E1>(e1)); } template <typename E1> -CMT_INLINE internal::expression_upsample<4, E1> upsample4(E1&& e1) +KFR_FUNCTION internal::expression_upsample<4, E1> upsample4(E1&& e1) { return internal::expression_upsample<4, E1>(std::forward<E1>(e1)); } template <typename T = fbase> -inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality, - size_t interpolation_factor, size_t decimation_factor, - subtype<T> scale = subtype<T>(1), - 
subtype<T> cutoff = 0.5f) +KFR_FUNCTION samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality, + size_t interpolation_factor, + size_t decimation_factor, + subtype<T> scale = subtype<T>(1), + subtype<T> cutoff = 0.5f) { using itype = typename samplerate_converter<T>::itype; return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale, @@ -376,12 +395,13 @@ inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_qual // Deprecated in 0.9.2 template <typename T = fbase> -inline samplerate_converter<T> resampler(sample_rate_conversion_quality quality, size_t interpolation_factor, - size_t decimation_factor, subtype<T> scale = subtype<T>(1), - subtype<T> cutoff = 0.5f) +KFR_FUNCTION samplerate_converter<T> resampler(sample_rate_conversion_quality quality, + size_t interpolation_factor, size_t decimation_factor, + subtype<T> scale = subtype<T>(1), subtype<T> cutoff = 0.5f) { using itype = typename samplerate_converter<T>::itype; return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale, cutoff); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/speaker.hpp b/include/kfr/dsp/speaker.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -27,6 +27,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class Speaker : int { @@ -93,4 +95,5 @@ enum class SpeakerArrangement : int Music81 = 27, Arr102 = 28 }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/special.hpp b/include/kfr/dsp/special.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -26,16 +26,19 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/operators.hpp" -#include "../base/vec.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Returns expression 
template that generates a unit impulse */ template <typename T = int> -static auto unitimpulse() +auto unitimpulse() { return lambda<T>([](cinput_t, size_t index, auto x) { if (index == 0) @@ -46,7 +49,7 @@ static auto unitimpulse() } template <typename T = fbase> -static auto jaehne_arg(size_t size) +auto jaehne_arg(size_t size) { return truncate(constants<T>::pi_s(1, 2) * sqr(linspace(T(0), T(size), size, false)) / size, size); } @@ -56,13 +59,13 @@ static auto jaehne_arg(size_t size) * Generates the sine with linearly increasing frequency from 0hz to nyquist frequency. */ template <typename T = fbase> -static auto jaehne(identity<T> magn, size_t size) +auto jaehne(identity<T> magn, size_t size) { return magn * sin(jaehne_arg<T>(size)); } template <typename T = fbase> -static auto swept_arg(size_t size) +auto swept_arg(size_t size) { return truncate(constants<T>::pi_s(1, 4) * sqr(sqr(linspace(T(0), T(size), size, false)) / sqr(T(size))) * T(size), @@ -74,8 +77,9 @@ static auto swept_arg(size_t size) * Generates the sine with logarithmically increasing frequency from 0hz to nyquist frequency. 
*/ template <typename T = fbase> -static auto swept(identity<T> magn, size_t size) +auto swept(identity<T> magn, size_t size) { return magn * sin(swept_arg<T>(size)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,41 +25,43 @@ */ #pragma once -#include "../base/abs.hpp" #include "../base/basic_expressions.hpp" -#include "../base/log_exp.hpp" -#include "../base/vec.hpp" +#include "../math/abs.hpp" +#include "../math/log_exp.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ using sample_rate_t = double; namespace intrinsics { template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(const T& amp) +KFR_INTRINSIC TF amp_to_dB(const T& amp) { return log(static_cast<TF>(abs(amp))) * subtype<TF>(8.6858896380650365530225783783322); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(const T& dB) +KFR_INTRINSIC TF dB_to_amp(const T& dB) { return exp(dB * subtype<TF>(0.11512925464970228420089957273422)); // return exp10( dB / 20 ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(const T& amp, const T& offset) +KFR_INTRINSIC TF amp_to_dB(const T& amp, const T& offset) { return log_fmadd(static_cast<TF>(abs(amp)), subtype<TF>(8.6858896380650365530225783783322), offset); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset) +KFR_INTRINSIC TF dB_to_amp(const T& dB, const T& offset) { auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset; return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs); @@ -67,13 +69,13 @@ KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset) } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout power_to_dB(const T& x) 
+KFR_INTRINSIC Tout power_to_dB(const T& x) { return log(static_cast<Tout>(abs(x))) * (10 * c_recip_log_10<Tout>); } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout dB_to_power(const T& x) +KFR_INTRINSIC Tout dB_to_power(const T& x) { if (x == -c_infinity<Tout>) return 0.0; @@ -82,7 +84,7 @@ KFR_SINTRIN Tout dB_to_power(const T& x) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF note_to_hertz(const T& note) +KFR_INTRINSIC TF note_to_hertz(const T& note) { const subtype<TF> offset = 2.1011784386926213177653145771814; @@ -90,7 +92,7 @@ KFR_SINTRIN TF note_to_hertz(const T& note) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF hertz_to_note(const T& hertz) +KFR_INTRINSIC TF hertz_to_note(const T& hertz) { const subtype<TF> offset = -36.376316562295915248836189714583; @@ -98,7 +100,7 @@ KFR_SINTRIN TF hertz_to_note(const T& hertz) } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz) +KFR_INTRINSIC Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz) { const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818); @@ -106,7 +108,7 @@ KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tuneh } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz) +KFR_INTRINSIC Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz) { const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023); @@ -121,74 +123,75 @@ KFR_I_FN(power_to_dB) KFR_I_FN(dB_to_power) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> note_to_hertz(const T1& x) +KFR_FUNCTION flt_type<T1> note_to_hertz(const T1& x) { return 
intrinsics::note_to_hertz(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x) +KFR_FUNCTION internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x) { return { fn::note_to_hertz(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> hertz_to_note(const T1& x) +KFR_FUNCTION flt_type<T1> hertz_to_note(const T1& x) { return intrinsics::hertz_to_note(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x) +KFR_FUNCTION internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x) { return { fn::hertz_to_note(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> amp_to_dB(const T1& x) +KFR_FUNCTION flt_type<T1> amp_to_dB(const T1& x) { return intrinsics::amp_to_dB(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x) { return { fn::amp_to_dB(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> dB_to_amp(const T1& x) +KFR_FUNCTION flt_type<T1> dB_to_amp(const T1& x) { return intrinsics::dB_to_amp(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x) +KFR_FUNCTION internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x) { return { fn::dB_to_amp(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> power_to_dB(const T1& x) +KFR_FUNCTION flt_type<T1> power_to_dB(const T1& x) { return 
intrinsics::power_to_dB(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x) +KFR_FUNCTION internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x) { return { fn::power_to_dB(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> dB_to_power(const T1& x) +KFR_FUNCTION flt_type<T1> dB_to_power(const T1& x) { return intrinsics::dB_to_power(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x) +KFR_FUNCTION internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x) { return { fn::dB_to_power(), std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/waveshaper.hpp b/include/kfr/dsp/waveshaper.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,12 +25,15 @@ */ #pragma once -#include "../base/clamp.hpp" -#include "../base/hyperbolic.hpp" -#include "../base/operators.hpp" +#include "../math/clamp.hpp" +#include "../math/hyperbolic.hpp" +#include "../simd/operators.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + template <typename E1> inline auto waveshaper_hardclip(E1&& input, double clip_level) { @@ -44,7 +47,7 @@ inline auto waveshaper_tanh(E1&& input, double saturation) } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -CMT_FUNC flt_type<T1> saturate_I(const T1& x) +KFR_FUNCTION flt_type<T1> saturate_I(const T1& x) { const flt_type<T1> xx = -1 / (abs(static_cast<flt_type<T1>>(x)) + 1) + 1; return mulsign(xx, static_cast<flt_type<T1>>(x)); @@ -52,7 +55,7 @@ CMT_FUNC flt_type<T1> saturate_I(const T1& x) KFR_FN(saturate_I) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -CMT_FUNC flt_type<T1> saturate_II(const T1& x) +KFR_FUNCTION flt_type<T1> saturate_II(const T1& 
x) { const flt_type<T1> xx = sqr(abs(static_cast<flt_type<T1>>(x)) + 1); return mulsign((xx - 1) / (xx + 1), static_cast<flt_type<T1>>(x)); @@ -60,13 +63,13 @@ CMT_FUNC flt_type<T1> saturate_II(const T1& x) KFR_FN(saturate_II) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x) +KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x) { return { fn::saturate_I(), std::forward<E1>(x) }; } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x) +KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x) { return { fn::saturate_II(), std::forward<E1>(x) }; } @@ -88,4 +91,5 @@ inline auto waveshaper_poly(E1&& input, fbase c1, fbase c3, Cs... cs) { return horner_odd(input, c1, c3, static_cast<fbase>(cs)...); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/weighting.hpp b/include/kfr/dsp/weighting.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,16 +25,19 @@ */ #pragma once -#include "../base/operators.hpp" -#include "../base/sqrt.hpp" +#include "../math/sqrt.hpp" +#include "../simd/operators.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + namespace intrinsics { template <typename T> -KFR_SINTRIN T weight_a_unnorm(T f) +KFR_INTRINSIC T weight_a_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * pow4(f); @@ -46,13 +49,13 @@ template <typename T> const static T weight_a_gain = reciprocal(weight_a_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T aweighting(T f) +KFR_INTRINSIC T aweighting(T f) { return weight_a_unnorm(f) * weight_a_gain<subtype<T>>; } template <typename T> -KFR_SINTRIN T weight_b_unnorm(T f) +KFR_INTRINSIC T weight_b_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * pow3(f); @@ -65,13 +68,13 @@ template 
<typename T> const static T weight_b_gain = reciprocal(weight_b_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T bweighting(T f) +KFR_INTRINSIC T bweighting(T f) { return weight_b_unnorm(f) * weight_b_gain<subtype<T>>; } template <typename T> -KFR_SINTRIN T weight_c_unnorm(T f) +KFR_INTRINSIC T weight_c_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * f2; @@ -84,7 +87,7 @@ template <typename T> const static T weight_c_gain = reciprocal(weight_c_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T cweighting(T f) +KFR_INTRINSIC T cweighting(T f) { return weight_c_unnorm(f) * weight_c_gain<subtype<T>>; } @@ -94,38 +97,39 @@ KFR_I_FN(bweighting) KFR_I_FN(cweighting) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 aweighting(const T1& x) +KFR_INTRINSIC T1 aweighting(const T1& x) { return intrinsics::aweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::aweighting, E1> aweighting(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::aweighting, E1> aweighting(E1&& x) { return { fn::aweighting(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 bweighting(const T1& x) +KFR_INTRINSIC T1 bweighting(const T1& x) { return intrinsics::bweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::bweighting, E1> bweighting(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::bweighting, E1> bweighting(E1&& x) { return { fn::bweighting(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 cweighting(const T1& x) +KFR_INTRINSIC T1 cweighting(const T1& x) { return intrinsics::cweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::cweighting, E1> cweighting(E1&& x) +KFR_INTRINSIC 
internal::expression_function<fn::cweighting, E1> cweighting(E1&& x) { return { fn::cweighting(), std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup window * @{ */ /* @@ -25,15 +25,17 @@ */ #pragma once -#include "../base/log_exp.hpp" -#include "../base/modzerobessel.hpp" #include "../base/pointer.hpp" -#include "../base/sin_cos.hpp" -#include "../base/sqrt.hpp" -#include "../base/vec.hpp" +#include "../math/log_exp.hpp" +#include "../math/modzerobessel.hpp" +#include "../math/sin_cos.hpp" +#include "../math/sqrt.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class window_type { @@ -125,11 +127,12 @@ struct expression_rectangular : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_rectangular& self, cinput_t, + size_t index, vec_shape<T, N>) { using TI = utype<T>; - const vec<TI, N> i = enumerate(vec<TI, N>()) + cast<TI>(index); - return select(i < cast<TI>(m_size), T(1), T(0)); + const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index); + return select(i < static_cast<TI>(self.m_size), T(1), T(0)); } size_t size() const { return m_size; } @@ -147,9 +150,10 @@ struct expression_triangular : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_triangular& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return 1 - abs(linspace(cinput, index, y)); + return 1 - abs(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -168,9 +172,10 @@ struct expression_bartlett : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, 
size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return 1 - abs(linspace(cinput, index, y)); + return 1 - abs(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -189,9 +194,10 @@ struct expression_cosine : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_cosine& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return sin(c_pi<T> * linspace(cinput, index, y)); + return sin(c_pi<T> * get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -210,9 +216,10 @@ struct expression_hann : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hann& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))); + return T(0.5) * (T(1) - cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -231,9 +238,10 @@ struct expression_bartlett_hann : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett_hann& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> xx = linspace(cinput, index, y); + const vec<T, N> xx = get_elements(self.linspace, cinput, index, y); return T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))); } size_t size() const { return m_size; } @@ -253,9 +261,11 @@ struct expression_hamming : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> 
operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hamming& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return alpha - (T(1.0) - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))); + return self.alpha - + (T(1.0) - self.alpha) * (cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -275,9 +285,10 @@ struct expression_bohman : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bohman& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = abs(linspace(cinput, index, y)); + const vec<T, N> n = abs(get_elements(self.linspace, cinput, index, y)); return (T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n); } size_t size() const { return m_size; } @@ -297,10 +308,11 @@ struct expression_blackman : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y); - return a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n); + const vec<T, N> n = get_elements(self.linspace, cinput, index, y); + return self.a0 - self.a1 * cos(c_pi<T, 2> * n) + self.a2 * cos(c_pi<T, 4> * n); } size_t size() const { return m_size; } @@ -320,9 +332,10 @@ struct expression_blackman_harris : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman_harris& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; + const 
vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>; return T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n); } size_t size() const { return m_size; } @@ -343,9 +356,11 @@ struct expression_kaiser : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_kaiser& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m; + return modzerobessel(self.beta * sqrt(1 - sqr(get_elements(self.linspace, cinput, index, y)))) * + self.m; } size_t size() const { return m_size; } @@ -366,9 +381,10 @@ struct expression_flattop : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_flattop& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; + const vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>; constexpr T a0 = 1; constexpr T a1 = 1.93; constexpr T a2 = 1.29; @@ -393,9 +409,10 @@ struct expression_gaussian : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_gaussian& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return exp(T(-0.5) * sqr(alpha * linspace(cinput, index, y))); + return exp(T(-0.5) * sqr(self.alpha * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -416,9 +433,10 @@ struct expression_lanczos : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const 
expression_lanczos& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return sinc(linspace(cinput, index, y)); + return sinc(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -458,7 +476,7 @@ KFR_WINDOW_BY_TYPE(lanczos) /** * @brief Returns template expression that generates Rrectangular window of length @c size */ -CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size) +KFR_FUNCTION internal::expression_rectangular<fbase> window_rectangular(size_t size) { return internal::expression_rectangular<fbase>(size, fbase()); } @@ -467,7 +485,7 @@ CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t siz * @brief Returns template expression that generates Triangular window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_triangular<T>(size); } @@ -476,7 +494,7 @@ CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, cty * @brief Returns template expression that generates Bartlett window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett<T>(size); } @@ -485,7 +503,7 @@ CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t * @brief Returns template expression that generates Cosine window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) { return 
internal::expression_cosine<T>(size); } @@ -494,7 +512,7 @@ CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> * @brief Returns template expression that generates Hann window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_hann<T>(size); } @@ -503,7 +521,8 @@ CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ct * @brief Returns template expression that generates Bartlett-Hann window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, + ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett_hann<T>(size); } @@ -513,8 +532,8 @@ CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t siz * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54, - ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54, + ctype_t<T> = ctype_t<T>()) { return internal::expression_hamming<T>(size, alpha); } @@ -523,7 +542,7 @@ CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity< * @brief Returns template expression that generates Bohman window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bohman<T>(size); } @@ -533,7 +552,7 @@ CMT_INLINE internal::expression_bohman<T> window_bohman(size_t 
size, ctype_t<T> * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_blackman<T> window_blackman( +KFR_FUNCTION internal::expression_blackman<T> window_blackman( size_t size, identity<T> alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { @@ -544,7 +563,7 @@ CMT_INLINE internal::expression_blackman<T> window_blackman( * @brief Returns template expression that generates Blackman-Harris window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris( +KFR_FUNCTION internal::expression_blackman_harris<T> window_blackman_harris( size_t size, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { return internal::expression_blackman_harris<T>(size, T(), symmetry); @@ -555,8 +574,8 @@ CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris( * beta */ template <typename T = fbase> -CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5), - ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5), + ctype_t<T> = ctype_t<T>()) { return internal::expression_kaiser<T>(size, beta); } @@ -565,7 +584,7 @@ CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> * @brief Returns template expression that generates Flat top window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_flattop<T>(size); } @@ -575,8 +594,8 @@ CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5, - 
ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5, + ctype_t<T> = ctype_t<T>()) { return internal::expression_gaussian<T>(size, alpha); } @@ -585,7 +604,7 @@ CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identit * @brief Returns template expression that generates Lanczos window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_lanczos<T>(size); } @@ -615,6 +634,7 @@ CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identit return to_pointer( typename internal::window_by_type<window>::template type<T>(size, win_param, symmetry)); }, - fn::returns<expression_pointer<T>>()); + fn_generic::returns<expression_pointer<T>>()); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/ext/console_colors.hpp b/include/kfr/ext/console_colors.hpp @@ -1,162 +0,0 @@ -#pragma once -#include <cstdint> -#include <cstdio> - -//#define CONSOLE_COLORS_FORCE_ASCII - -#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII -#define USE_WIN32_API -#endif - -#if defined(USE_WIN32_API) - -namespace win32_lite -{ -typedef void* HANDLE; -typedef uint32_t DWORD; - -#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10) -#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11) -#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12) - -#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport) - -#define WIN32_LITE_WINAPI __stdcall - -typedef short SHORT; -typedef unsigned short WORD; -typedef int WINBOOL; - -extern "C" -{ - WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle); - WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput, - 
WORD wAttributes); -} -} // namespace win32_lite - -#endif - -namespace console_colors -{ - -enum text_color : uint32_t -{ - Black = 0x00, - DarkBlue = 0x01, - DarkGreen = 0x02, - DarkCyan = 0x03, - DarkRed = 0x04, - DarkMagenta = 0x05, - DarkYellow = 0x06, - LightGrey = 0x07, - Gray = 0x08, - Blue = 0x09, - Green = 0x0A, - Cyan = 0x0B, - Red = 0x0C, - Magenta = 0x0D, - Yellow = 0x0E, - White = 0x0F, - BgBlack = 0x00, - BgDarkBlue = 0x10, - BgDarkGreen = 0x20, - BgDarkCyan = 0x30, - BgDarkRed = 0x40, - BgDarkMagenta = 0x50, - BgDarkYellow = 0x60, - BgLightGrey = 0x70, - BgGray = 0x80, - BgBlue = 0x90, - BgGreen = 0xA0, - BgCyan = 0xB0, - BgRed = 0xC0, - BgMagenta = 0xD0, - BgYellow = 0xE0, - BgWhite = 0xF0, - - Normal = BgBlack | LightGrey -}; - -enum console_buffer -{ - ConsoleStdOutput, - ConsoleStdError -}; - -struct console_color -{ -public: - console_color(text_color c, console_buffer console = ConsoleStdOutput) - : m_old(get(console)), m_console(console) - { - set(c, m_console); - } - - ~console_color() { set(m_old, m_console); } - -private: - text_color get(console_buffer console = ConsoleStdOutput) { return saved_color(); } - - void set(text_color new_color, console_buffer console = ConsoleStdOutput) - { -#ifdef USE_WIN32_API - win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput - ? WIN32_LITE_STD_OUTPUT_HANDLE - : WIN32_LITE_STD_ERROR_HANDLE), - static_cast<win32_lite::WORD>(new_color)); -#else - if (new_color != Normal) - { - uint8_t t = new_color & 0xF; - uint8_t b = (new_color & 0xF0) >> 4; - uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2); - uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2); - if (t & 8) - tnum += 60; - if (b & 8) - bnum += 60; - std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum); - } - else - { - std::fprintf(console == ConsoleStdOutput ? 
stdout : stderr, "\x1B[0m"); - } -#endif - saved_color() = new_color; - } - - text_color m_old; - console_buffer m_console; - static text_color& saved_color() - { - static text_color color = Normal; - return color; - } -}; - -template <text_color color, console_buffer console = ConsoleStdOutput> -struct console_color_tpl : public console_color -{ -public: - console_color_tpl() : console_color(color, console) {} - -private: -}; - -typedef console_color_tpl<DarkBlue> darkblue_text; -typedef console_color_tpl<DarkGreen> darkgreen_text; -typedef console_color_tpl<DarkCyan> darkcyan_text; -typedef console_color_tpl<DarkRed> darkred_text; -typedef console_color_tpl<DarkMagenta> darkmagenta_text; -typedef console_color_tpl<DarkYellow> darkyellow_text; -typedef console_color_tpl<LightGrey> lightgrey_text; -typedef console_color_tpl<Gray> gray_text; -typedef console_color_tpl<Blue> blue_text; -typedef console_color_tpl<Green> green_text; -typedef console_color_tpl<Cyan> cyan_text; -typedef console_color_tpl<Red> red_text; -typedef console_color_tpl<Magenta> magenta_text; -typedef console_color_tpl<Yellow> yellow_text; -typedef console_color_tpl<White> white_text; -} // namespace console_colors diff --git a/include/kfr/ext/double_double.hpp b/include/kfr/ext/double_double.hpp @@ -1,86 +0,0 @@ -#pragma once - -#include <cmath> - -struct double_double -{ - double hi, lo; - - constexpr double_double(double x) noexcept : hi(x), lo(0.0) {} - constexpr double_double(float x) noexcept : hi(x), lo(0.0) {} - constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {} - constexpr operator double() const noexcept { return hi + lo; } - constexpr operator float() const noexcept { return hi + lo; } - - constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; } - constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept - { - const double sum = x.hi + y.hi; - return { 
sum, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo - : (((y.hi - sum) + x.hi) + x.lo) + y.lo }; - } - constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept - { - const double diff = x.hi - y.hi; - return { diff, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - diff) - y.hi) - y.lo) + x.lo - : (((-y.hi - diff) + x.hi) + x.lo) - y.lo }; - } - constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept - { - const double_double c = mul(x.hi, y.hi); - const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo; - return { c.hi, cc }; - } - constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept - { - const double c = x.hi / y.hi; - const double_double u = mul(c, y.hi); - const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi; - return { c, cc }; - } - -#if defined _MSC_VER && !defined __clang__ -#define DOUBLEDOUBLE_CONSTEXPR -#else -#define DOUBLEDOUBLE_CONSTEXPR constexpr -#endif - - DOUBLEDOUBLE_CONSTEXPR bool isinf() const noexcept { return std::isinf(hi); } - DOUBLEDOUBLE_CONSTEXPR bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); } - - DOUBLEDOUBLE_CONSTEXPR double ulp(float value) const noexcept - { - if (std::isnan(value) && isnan()) - return 0.0; - if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi))) - return 0.0; - if (std::nexttoward(value, 0.0) == 0.0) - return 1.0; - return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); - } - DOUBLEDOUBLE_CONSTEXPR double ulp(double value) const noexcept - { - if (std::isnan(value) && isnan()) - return 0.0; - if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi))) - return 0.0; - if (std::nexttoward(value, 0.0) == 0.0) - return 1.0; - return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); - } - -private: - constexpr 
static double_double splitprec(double x) noexcept - { - const double p = x * 1.34217729e8; - const double h = (x - p) + p; - return { h, x - h }; - } - constexpr static double_double mul(double x, double y) noexcept - { - const double_double xx = splitprec(x); - const double_double yy = splitprec(y); - const double z = x * y; - return { z, ((xx.hi * yy.hi - z) + xx.hi * yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo }; - } -}; diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup audio_io * @{ */ /* @@ -28,8 +28,8 @@ #include "../base/basic_expressions.hpp" #include "../base/conversion.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" #include "../cometa/ctti.hpp" +#include "../simd/vec.hpp" #include "file.hpp" #ifndef KFR_ENABLE_WAV @@ -64,10 +64,8 @@ struct audio_format struct audio_format_and_length : audio_format { using audio_format::audio_format; -#ifdef CMT_COMPILER_MSVC - audio_format_and_length() noexcept {} -#endif - audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {} + constexpr audio_format_and_length() CMT_NOEXCEPT {} + constexpr audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {} imax length = 0; // in samples }; @@ -95,39 +93,43 @@ struct audio_writer : public abstract_writer<T> virtual void close() = 0; }; -namespace internal +namespace internal_generic { #if KFR_ENABLE_WAV -static size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, size_t bytesToWrite) +static inline size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, + size_t bytesToWrite) { return file->write(pData, bytesToWrite); } -static drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, drwav_seek_origin origin) +static inline drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, + drwav_seek_origin origin) { return file->seek(offset, origin == drwav_seek_origin_start ? 
seek_origin::begin : seek_origin::current); } -static size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +static inline size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) { return file->read(pBufferOut, bytesToRead); } -static drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, drwav_seek_origin origin) +static inline drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, + drwav_seek_origin origin) { return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); } #endif #if KFR_ENABLE_FLAC -static size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +static inline size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, + size_t bytesToRead) { return file->read(pBufferOut, bytesToRead); } -static drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, - drflac_seek_origin origin) +static inline drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, + drflac_seek_origin origin) { return file->seek(offset, origin == drflac_seek_origin_start ? seek_origin::begin : seek_origin::current); } #endif -} // namespace internal +} // namespace internal_generic #if KFR_ENABLE_WAV /// @brief WAV format writer @@ -139,17 +141,19 @@ struct audio_writer_wav : audio_writer<T> : writer(std::move(writer)), f(nullptr), fmt(fmt) { drwav_data_format wav_fmt; - wav_fmt.channels = fmt.channels; - wav_fmt.sampleRate = fmt.samplerate; + wav_fmt.channels = static_cast<drwav_uint32>(fmt.channels); + wav_fmt.sampleRate = static_cast<drwav_uint32>(fmt.samplerate); wav_fmt.format = fmt.type >= audio_sample_type::first_float ? 
DR_WAVE_FORMAT_IEEE_FLOAT : DR_WAVE_FORMAT_PCM; - wav_fmt.bitsPerSample = audio_sample_bit_depth(fmt.type); + wav_fmt.bitsPerSample = static_cast<drwav_uint32>(audio_sample_bit_depth(fmt.type)); wav_fmt.container = fmt.use_w64 ? drwav_container_w64 : drwav_container_riff; - f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal::drwav_writer_write_proc, - (drwav_seek_proc)&internal::drwav_writer_seek_proc, this->writer.get()); + f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal_generic::drwav_writer_write_proc, + (drwav_seek_proc)&internal_generic::drwav_writer_seek_proc, this->writer.get()); } ~audio_writer_wav() { close(); } + using audio_writer<T>::write; + /// @brief Write data to underlying binary writer size_t write(const T* data, size_t size) override { @@ -184,7 +188,7 @@ struct audio_writer_wav : audio_writer<T> imax tell() const override { return fmt.length; } - bool seek(imax position, seek_origin origin) override { return false; } + bool seek(imax, seek_origin) override { return false; } private: std::shared_ptr<abstract_writer<>> writer; @@ -199,8 +203,8 @@ struct audio_reader_wav : audio_reader<T> /// @brief Constructs WAV reader audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) { - f = drwav_open((drwav_read_proc)&internal::drwav_reader_read_proc, - (drwav_seek_proc)&internal::drwav_reader_seek_proc, this->reader.get()); + f = drwav_open((drwav_read_proc)&internal_generic::drwav_reader_read_proc, + (drwav_seek_proc)&internal_generic::drwav_reader_seek_proc, this->reader.get()); fmt.channels = f->channels; fmt.samplerate = f->sampleRate; fmt.length = f->totalSampleCount / fmt.channels; @@ -307,8 +311,8 @@ struct audio_reader_flac : audio_reader<T> /// @brief Constructs FLAC reader audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) { - f = drflac_open((drflac_read_proc)&internal::drflac_reader_read_proc, - (drflac_seek_proc)&internal::drflac_reader_seek_proc, 
this->reader.get()); + f = drflac_open((drflac_read_proc)&internal_generic::drflac_reader_read_proc, + (drflac_seek_proc)&internal_generic::drflac_reader_seek_proc, this->reader.get()); fmt.channels = f->channels; fmt.samplerate = f->sampleRate; fmt.length = f->totalSampleCount / fmt.channels; diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup binary_io * @{ */ /* @@ -25,9 +25,9 @@ */ #pragma once -#include "../base/function.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" #include <cstdio> #include <string> #include <vector> @@ -63,6 +63,7 @@ inline FILE* fopen_portable(const filepath_char* path, const filepath_char* mode #ifdef CMT_OS_WIN FILE* f = nullptr; errno_t e = _wfopen_s(&f, path, mode); + (void)e; return f; #else return fopen(path, mode); @@ -98,6 +99,14 @@ struct abstract_stream bool seek(imax offset, int origin) { return seek(offset, static_cast<seek_origin>(origin)); } }; +namespace internal_generic +{ +struct empty +{ +}; + +} // namespace internal_generic + /// @brief Base class for all typed readers template <typename T = void> struct abstract_reader : abstract_stream<T> @@ -117,6 +126,10 @@ struct abstract_reader : abstract_stream<T> this->read(result); return result; } + bool read(conditional<is_void<T>::value, internal_generic::empty, T>& data) + { + return read(&data, 1) == 1; + } }; /// @brief Base class for all typed writers @@ -131,6 +144,10 @@ struct abstract_writer : abstract_stream<T> return write(data.data(), data.size()); } size_t write(univector_ref<const T>&& data) { return write(data.data(), data.size()); } + bool write(const conditional<is_void<T>::value, internal_generic::empty, T>& data) + { + return write(&data, 1) == 1; + } }; template <typename From, typename To = void> @@ -207,6 +224,8 @@ struct file_reader : abstract_reader<T> ~file_reader() override {} size_t read(T* data, size_t 
size) final { return fread(data, element_size<T>(), size, handle.file); } + using abstract_reader<T>::read; + imax tell() const final { return IO_TELL_64(handle.file); } bool seek(imax offset, seek_origin origin) final { @@ -221,6 +240,8 @@ struct file_writer : abstract_writer<T> { file_writer(file_handle&& handle) : handle(std::move(handle)) {} ~file_writer() override {} + + using abstract_writer<T>::write; size_t write(const T* data, size_t size) final { return fwrite(data, element_size<T>(), size, handle.file); diff --git a/include/kfr/io/impl/audiofile-impl.cpp b/include/kfr/io/impl/audiofile-impl.cpp @@ -25,6 +25,8 @@ */ #include "../audiofile.hpp" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wimplicit-fallthrough") #if defined(KFR_ENABLE_WAV) && KFR_ENABLE_WAV #define DR_WAV_NO_STDIO @@ -37,3 +39,5 @@ #define DR_FLAC_NO_STDIO #include "../dr/dr_flac.h" #endif + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup plotting * @{ */ /* @@ -24,8 +24,8 @@ See https://www.kfrlib.com for details. 
*/ #pragma once -#include "../base/vec.hpp" #include "../cometa/string.hpp" +#include "../simd/vec.hpp" #include <cstdlib> #ifdef CMT_OS_WIN @@ -38,7 +38,7 @@ namespace kfr { -namespace internal +namespace internal_generic { CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wdeprecated-declarations") @@ -51,7 +51,7 @@ void python(const std::string& name, const std::string& code) std::string filename; { char curdir[1024]; - cross_getcwd(curdir, arraysize(curdir)); + (void)cross_getcwd(curdir, arraysize(curdir)); filename = curdir; } #ifdef CMT_OS_WIN @@ -64,7 +64,7 @@ void python(const std::string& name, const std::string& code) FILE* f = fopen(filename.c_str(), "w"); fwrite(code.c_str(), 1, code.size(), f); fclose(f); - std::system(("python \"" + filename + "\"").c_str()); + (void)std::system(("python \"" + filename + "\"").c_str()); } CMT_PRAGMA_GNU(GCC diagnostic pop) @@ -78,7 +78,7 @@ inline T flush_to_zero(T value) { return static_cast<double>(value); } -} // namespace internal +} // namespace internal_generic inline std::string concat_args() { return {}; } @@ -106,7 +106,7 @@ void plot_show(const std::string& name, const std::string& wavfile, const std::s std::string ss; ss += python_prologue() + "dspplot.plot(" + concat_args("r'" + wavfile + "'", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } @@ -125,12 +125,12 @@ void plot_show(const std::string& name, const T& x, const std::string& options = std::string ss; ss += python_prologue() + "data = [\n"; for (size_t i = 0; i < array.size(); i++) - ss += as_string(fmt<'g', 20, 17>(internal::flush_to_zero(array[i])), ",\n"); + ss += as_string(fmt<'g', 20, 17>(internal_generic::flush_to_zero(array[i])), ",\n"); ss += "]\n"; ss += "dspplot.plot(" + concat_args("data", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } @@ -170,7 +170,7 @@ void perfplot_show(const std::string& name, T1&& data, T2&& 
labels, const std::s ss += "dspplot.perfplot(" + concat_args("data, labels", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup string_io * @{ */ /* @@ -25,15 +25,50 @@ */ #pragma once -#include "../base/complex.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" #include "../cometa/string.hpp" +#include "../simd/complex.hpp" +#include "../simd/vec.hpp" #include <cmath> namespace cometa { +template <> +struct representation<cometa::special_value> +{ + using type = std::string; + static std::string get(const cometa::special_value& value) + { + using cometa::special_constant; + switch (value.c) + { + case special_constant::undefined: + return "undefined"; + case special_constant::default_constructed: + return "default_constructed"; + case special_constant::infinity: + return "infinity"; + case special_constant::neg_infinity: + return "neg_infinity"; + case special_constant::min: + return "min"; + case special_constant::max: + return "max"; + case special_constant::neg_max: + return "neg_max"; + case special_constant::lowest: + return "lowest"; + case special_constant::integer: + return as_string(value.ll); + case special_constant::floating_point: + return as_string(value.d); + default: + return "unknown"; + } + } +}; + namespace details { @@ -157,10 +192,21 @@ struct representation<kfr::univector<T, Tag>> return details::array_to_string(value.data(), value.size()); } }; +template <typename T, size_t Size> +struct representation<std::array<T, Size>> +{ + using type = std::string; + static std::string get(const std::array<T, Size>& value) + { + return details::array_to_string(value.data(), value.size()); + } +}; } // namespace cometa namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { @@ -205,6 +251,7 @@ inline internal::expression_printer printer() { return 
internal::expression_prin /// @brief Returns an output expression that prints the values with their types (used for debug) inline internal::expression_debug_printer debug_printer() { return internal::expression_debug_printer(); } +} // namespace CMT_ARCH_NAME /// @brief Converts dB value to string (uses oo for infinity symbol) template <typename T> diff --git a/include/kfr/kfr.h b/include/kfr/kfr.h @@ -0,0 +1,70 @@ +/** @addtogroup utility + * @{ + */ +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include "cident.h" + +#define KFR_VERSION_MAJOR 3 +#define KFR_VERSION_MINOR 0 +#define KFR_VERSION_PATCH 5 +#define KFR_VERSION_LABEL "rc" + +#define KFR_VERSION_STRING \ + CMT_STRINGIFY(KFR_VERSION_MAJOR) \ + "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_PATCH) "-" KFR_VERSION_LABEL +#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_PATCH) + +#if defined DEBUG || defined KFR_DEBUG +#define KFR_DEBUG_STR " debug" +#elif defined NDEBUG || defined KFR_NDEBUG +#define KFR_DEBUG_STR " optimized" +#else +#define KFR_DEBUG_STR "" +#endif + +#define KFR_NATIVE_INTRINSICS 1 + +#if defined CMT_COMPILER_CLANG && !defined CMT_DISABLE_CLANG_EXT +#define CMT_CLANG_EXT +#endif + +#ifdef KFR_NATIVE_INTRINSICS +#define KFR_BUILD_DETAILS_1 " +in" +#else +#define KFR_BUILD_DETAILS_1 "" +#endif + +#ifdef CMT_CLANG_EXT +#define KFR_BUILD_DETAILS_2 " +ve" +#else +#define KFR_BUILD_DETAILS_2 "" +#endif + +#define KFR_VERSION_FULL \ + "KFR " KFR_VERSION_STRING KFR_DEBUG_STR \ + " " CMT_STRINGIFY(CMT_ARCH_NAME) " " CMT_ARCH_BITNESS_NAME " (" CMT_COMPILER_FULL_NAME "/" CMT_OS_NAME \ + ")" KFR_BUILD_DETAILS_1 KFR_BUILD_DETAILS_2 + +#ifdef __cplusplus +namespace kfr +{ +/// @brief KFR version string +constexpr const char version_string[] = KFR_VERSION_STRING; + +constexpr int version_major = KFR_VERSION_MAJOR; +constexpr int version_minor = KFR_VERSION_MINOR; +constexpr int version_patch = KFR_VERSION_PATCH; +constexpr int 
version = KFR_VERSION; + +/// @brief KFR version string including architecture and compiler name +constexpr const char version_full[] = KFR_VERSION_FULL; +} // namespace kfr +#endif + +#define KFR_INTRINSIC CMT_INTRINSIC +#define KFR_MEM_INTRINSIC CMT_MEM_INTRINSIC +#define KFR_FUNCTION CMT_FUNCTION diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp @@ -22,4 +22,24 @@ */ #pragma once -#include "base.hpp" +#include "simd.hpp" + +#include "math/abs.hpp" +#include "math/asin_acos.hpp" +#include "math/atan.hpp" +#include "math/clamp.hpp" +#include "math/compiletime.hpp" +#include "math/complex_math.hpp" +#include "math/gamma.hpp" +#include "math/hyperbolic.hpp" +#include "math/interpolation.hpp" +#include "math/log_exp.hpp" +#include "math/logical.hpp" +#include "math/min_max.hpp" +#include "math/modzerobessel.hpp" +#include "math/round.hpp" +#include "math/saturation.hpp" +#include "math/select.hpp" +#include "math/sin_cos.hpp" +#include "math/sqrt.hpp" +#include "math/tan.hpp" diff --git a/include/kfr/math/abs.hpp b/include/kfr/math/abs.hpp @@ -0,0 +1,54 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the absolute value of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 abs(const T1& x) +{ + return intrinsics::abs(x); +} + +/** + * @brief Returns template expression that returns the absolute value of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::abs, E1> abs(E1&& x) +{ + return { fn::abs(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/asin_acos.hpp b/include/kfr/math/asin_acos.hpp @@ -0,0 +1,71 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/asin_acos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC flt_type<T1> asin(const T1& x) +{ + return intrinsics::asin(x); +} + +/** + * @brief Returns template expression that returns the arc sine of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::asin, E1> asin(E1&& x) +{ + return { fn::asin(), std::forward<E1>(x) }; +} +/** + * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC flt_type<T1> acos(const T1& x) +{ + return intrinsics::acos(x); +} + +/** + * @brief Returns template expression that returns the arc cosine of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::acos, E1> acos(E1&& x) +{ + return { fn::acos(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/atan.hpp b/include/kfr/math/atan.hpp @@ -0,0 +1,110 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/atan.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through + * \f$\pi/2\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> atan(const T1& x) +{ + return intrinsics::atan(x); +} + +/** + * @brief Returns template expression that returns the arc tangent of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::atan, E1> atan(E1&& x) +{ + return { fn::atan(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the arc tangent of the x, expressed in degrees. The returned angle is in the range -90 + * through 90. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> atandeg(const T1& x) +{ + return intrinsics::atandeg(x); +} + +/** + * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::atandeg, E1> atandeg(E1&& x) +{ + return { fn::atandeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant. 
+ */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION common_type<T1, T2> atan2(const T1& x, const T2& y) +{ + return intrinsics::atan2(x, y); +} + +/** + * @brief Returns template expression that returns the arc tangent of y/x. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y) +{ + return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the + * correct quadrant. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION common_type<T1, T2> atan2deg(const T1& x, const T2& y) +{ + return intrinsics::atan2deg(x, y); +} + +/** + * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees). + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y) +{ + return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/clamp.hpp b/include/kfr/math/clamp.hpp @@ -0,0 +1,65 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/clamp.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the first argument clamped to a range [lo, hi] +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), + typename Tout = common_type<T1, T2, T3>> +KFR_INTRINSIC Tout clamp(const T1& x, const T2& lo, const T3& hi) +{ + return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi)); +} + +/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi] +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi) +{ + return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) }; +} + +/// @brief Returns the first argument clamped to a range [0, hi] +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout clamp(const T1& x, const T2& hi) +{ + return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi)); +} + +/// @brief Creates an expression that returns the first argument clamped to a range [0, hi] +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi) +{ + return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git 
a/include/kfr/math/compiletime.hpp b/include/kfr/math/compiletime.hpp @@ -0,0 +1,84 @@ +/** @addtogroup math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "../simd/constants.hpp" +#include "../simd/operators.hpp" +#include "../simd/types.hpp" + +namespace kfr +{ + +namespace compiletime +{ + +template <typename T> +constexpr inline T select(bool c, T x, T y) +{ + return c ? x : y; +} +template <typename T> +constexpr inline T trunc(T x) +{ + return static_cast<T>(static_cast<long long>(x)); +} +template <typename T> +constexpr inline T abs(T x) +{ + return x < T() ? -x : x; +} +template <typename T> +constexpr inline T mulsign(T x, T y) +{ + return y < T() ? 
-x : x; +} +template <typename T> +constexpr inline T sin(T x) +{ + x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>; + constexpr T c2 = -0.16665853559970855712890625; + constexpr T c4 = +8.31427983939647674560546875e-3; + constexpr T c6 = -1.85423981747590005397796630859375e-4; + + x -= c_pi<T>; + T y = abs(x); + y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y); + y = mulsign(y, -x); + + const T y2 = y * y; + T formula = c6; + const T y3 = y2 * y; + formula = fmadd(formula, y2, c4); + formula = fmadd(formula, y2, c2); + formula = formula * y3 + y; + return formula; +} +template <typename T> +constexpr inline T cos(T x) +{ + return sin(x + c_pi<T, 1, 2>); +} +} // namespace compiletime +} // namespace kfr diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp @@ -0,0 +1,410 @@ +/** @addtogroup complex + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../simd/complex.hpp" +#include "abs.hpp" +#include "atan.hpp" +#include "hyperbolic.hpp" +#include "log_exp.hpp" +#include "min_max.hpp" +#include "select.hpp" +#include "sin_cos.hpp" +#include "sqrt.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csin(const vec<complex<T>, N>& x) +{ + return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csinh(const vec<complex<T>, N>& x) +{ + return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> ccos(const vec<complex<T>, N>& x) +{ + return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x) +{ + return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabs(const vec<complex<T>, N>& x) +{ + const vec<T, N* 2> xx = sqr(cdecom(x)); + return sqrt(even(xx) + odd(xx)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> carg(const vec<complex<T>, N>& x) +{ + const vec<T, N* 2> xx = cdecom(x); + return atan2(even(xx), odd(xx)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog(const vec<complex<T>, N>& x) +{ + return make_complex(log(cabs(x)), carg(x)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog2(const vec<complex<T>, N>& x) +{ + return clog(x) * avoid_odr_use(c_recip_log_2<T>); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog10(const vec<complex<T>, N>& x) +{ + return clog(x) * avoid_odr_use(c_recip_log_10<T>); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp(const vec<complex<T>, N>& x) +{ + 
return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x) +{ + return cexp(x * avoid_odr_use(c_log_2<T>)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x) +{ + return cexp(x * avoid_odr_use(c_log_10<T>)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> polar(const vec<complex<T>, N>& x) +{ + return make_complex(cabs(x), carg(x)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x) +{ + return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x)))); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabsdup(const vec<T, N>& x) +{ + x = sqr(x); + return sqrt(x + swap<2>(x)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x) +{ + const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5); + return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t)); +} + +KFR_HANDLE_SCALAR(cconj) +KFR_HANDLE_SCALAR(csin) +KFR_HANDLE_SCALAR(csinh) +KFR_HANDLE_SCALAR(ccos) +KFR_HANDLE_SCALAR(ccosh) +KFR_HANDLE_SCALAR(clog) +KFR_HANDLE_SCALAR(clog2) +KFR_HANDLE_SCALAR(clog10) +KFR_HANDLE_SCALAR(cexp) +KFR_HANDLE_SCALAR(cexp2) +KFR_HANDLE_SCALAR(cexp10) +KFR_HANDLE_SCALAR(polar) +KFR_HANDLE_SCALAR(cartesian) +KFR_HANDLE_SCALAR(csqrt) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabs(const vec<T, N>& a) +{ + return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> carg(const vec<T, N>& a) +{ + return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a))); +} +template <typename T1> +KFR_INTRINSIC realtype<T1> cabs(const T1& a) +{ + using vecout = vec1<T1>; + return to_scalar(intrinsics::cabs(vecout(a))); +} +template <typename T1> +KFR_INTRINSIC 
realtype<T1> carg(const T1& a) +{ + using vecout = vec1<T1>; + return to_scalar(intrinsics::carg(vecout(a))); +} +} // namespace intrinsics + +KFR_I_FN(cconj) +KFR_I_FN(csin) +KFR_I_FN(csinh) +KFR_I_FN(ccos) +KFR_I_FN(ccosh) +KFR_I_FN(cabs) +KFR_I_FN(carg) +KFR_I_FN(clog) +KFR_I_FN(clog2) +KFR_I_FN(clog10) +KFR_I_FN(cexp) +KFR_I_FN(cexp2) +KFR_I_FN(cexp10) +KFR_I_FN(polar) +KFR_I_FN(cartesian) +KFR_I_FN(csqrt) + +/// @brief Returns the sine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csin(const T1& x) +{ + return intrinsics::csin(x); +} + +/// @brief Returns template expression that returns the sine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::csin, E1> csin(E1&& x) +{ + return { fn::csin(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic sine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csinh(const T1& x) +{ + return intrinsics::csinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::csinh, E1> csinh(E1&& x) +{ + return { fn::csinh(), std::forward<E1>(x) }; +} + +/// @brief Returns the cosine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 ccos(const T1& x) +{ + return intrinsics::ccos(x); +} + +/// @brief Returns template expression that returns the cosine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::ccos, E1> ccos(E1&& x) +{ + return { fn::ccos(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> 
+KFR_FUNCTION T1 ccosh(const T1& x) +{ + return intrinsics::ccosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::ccosh, E1> ccosh(E1&& x) +{ + return { fn::ccosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the absolute value (magnitude) of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION realtype<T1> cabs(const T1& x) +{ + return intrinsics::cabs(x); +} + +/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cabs, E1> cabs(E1&& x) +{ + return { fn::cabs(), std::forward<E1>(x) }; +} + +/// @brief Returns the phase angle (argument) of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION realtype<T1> carg(const T1& x) +{ + return intrinsics::carg(x); +} + +/// @brief Returns template expression that returns the phase angle (argument) of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::carg, E1> carg(E1&& x) +{ + return { fn::carg(), std::forward<E1>(x) }; +} + +/// @brief Returns template expression that returns the complex conjugate of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cconj, E1> cconj(E1&& x) +{ + return { fn::cconj(), std::forward<E1>(x) }; +} + +/// @brief Returns the natural logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog(const T1& x) +{ + return intrinsics::clog(x); +} + +/// @brief Returns template expression that returns the natural logarithm of the 
complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog, E1> clog(E1&& x) +{ + return { fn::clog(), std::forward<E1>(x) }; +} + +/// @brief Returns the binary (base-2) logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog2(const T1& x) +{ + return intrinsics::clog2(x); +} + +/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog2, E1> clog2(E1&& x) +{ + return { fn::clog2(), std::forward<E1>(x) }; +} + +/// @brief Returns the common (base-10) logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog10(const T1& x) +{ + return intrinsics::clog10(x); +} + +/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog10, E1> clog10(E1&& x) +{ + return { fn::clog10(), std::forward<E1>(x) }; +} + +/// @brief Returns \f$e\f$ raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp(const T1& x) +{ + return intrinsics::cexp(x); +} + +/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp, E1> cexp(E1&& x) +{ + return { fn::cexp(), std::forward<E1>(x) }; +} + +/// @brief Returns 2 raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp2(const T1& x) +{ + return intrinsics::cexp2(x); +} + +/// @brief Returns template expression that returns 2 raised to the 
complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp2, E1> cexp2(E1&& x) +{ + return { fn::cexp2(), std::forward<E1>(x) }; +} + +/// @brief Returns 10 raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp10(const T1& x) +{ + return intrinsics::cexp10(x); +} + +/// @brief Returns template expression that returns 10 raised to the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp10, E1> cexp10(E1&& x) +{ + return { fn::cexp10(), std::forward<E1>(x) }; +} + +/// @brief Converts complex number to polar +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 polar(const T1& x) +{ + return intrinsics::polar(x); +} + +/// @brief Returns template expression that converts complex number to polar +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::polar, E1> polar(E1&& x) +{ + return { fn::polar(), std::forward<E1>(x) }; +} + +/// @brief Converts complex number to cartesian +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cartesian(const T1& x) +{ + return intrinsics::cartesian(x); +} + +/// @brief Returns template expression that converts complex number to cartesian +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cartesian, E1> cartesian(E1&& x) +{ + return { fn::cartesian(), std::forward<E1>(x) }; +} + +/// @brief Returns square root of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csqrt(const T1& x) +{ + return intrinsics::csqrt(x); +} + +/// @brief Returns template expression that returns square root of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> 
+KFR_FUNCTION internal::expression_function<fn::csqrt, E1> csqrt(E1&& x) +{ + return { fn::csqrt(), std::forward<E1>(x) }; +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/gamma.hpp b/include/kfr/math/gamma.hpp @@ -0,0 +1,63 @@ +/** @addtogroup other_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/gamma.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the approximate gamma function of an argument +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> gamma(const T1& x) +{ + return intrinsics::gamma(x); +} + +/// @brief Creates expression that returns the approximate gamma function of an argument +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::gamma, E1> gamma(E1&& x) +{ + return { fn::gamma(), std::forward<E1>(x) }; +} + +/// @brief Returns the approximate factorial of an argument +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> factorial_approx(const T1& x) +{ + return intrinsics::factorial_approx(x); +} + +/// @brief Creates expression that returns the approximate factorial of an argument +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x) +{ + return { fn::factorial_approx(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/hyperbolic.hpp b/include/kfr/math/hyperbolic.hpp @@ -0,0 +1,123 @@ +/** @addtogroup hyperbolic + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/hyperbolic.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the hyperbolic sine of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinh(const T1& x) +{ + return intrinsics::sinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinh, E1> sinh(E1&& x) +{ + return { fn::sinh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cosh(const T1& x) +{ + return intrinsics::cosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cosh, E1> cosh(E1&& x) +{ + return { fn::cosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic tangent of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> tanh(const T1& x) +{ + return intrinsics::tanh(x); +} + +/// @brief Returns template expression that returns the hyperbolic tangent of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::tanh, E1> tanh(E1&& x) +{ + return { fn::tanh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cotangent of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> coth(const T1& x) +{ + return 
intrinsics::coth(x); +} + +/// @brief Returns template expression that returns the hyperbolic cotangent of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::coth, E1> coth(E1&& x) +{ + return { fn::coth(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd +/// elements of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinhcosh(const T1& x) +{ + return intrinsics::sinhcosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the +/// hyperbolic cosine of the odd elements of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x) +{ + return { fn::sinhcosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd +/// elements of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> coshsinh(const T1& x) +{ + return intrinsics::coshsinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and +/// the hyperbolic sine of the odd elements of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x) +{ + return { fn::coshsinh(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp @@ -0,0 +1,138 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software 
Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/select.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<T>::invhighbitmask()); +} + +KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT +{ + const __m128i sh = _mm_srai_epi32(x.v, 31); + const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk); +} +KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); } +KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); } +KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); } +KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; } + +#if defined 
CMT_ARCH_AVX2 +KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT +{ + const __m256i sh = _mm256_srai_epi32(x.v, 31); + const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk); +} +KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); } +KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); } +KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); } +KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); } +KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); } +KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); } +KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); } +KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>::value) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); } +KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); } +KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { 
return vabsq_s64(x.v); } +#else +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); } +#endif + +KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; } + +KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); } +#else +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<f64>::invhighbitmask()); +} +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<T>::invhighbitmask()); +} + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return select(x >= T(0), x, -x); +} +#endif +KFR_HANDLE_SCALAR(abs) +} // namespace intrinsics + +KFR_I_FN(abs) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp @@ -0,0 +1,58 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/atan.hpp" +#include "../../math/select.hpp" +#include "../../math/sqrt.hpp" +#include "../../simd/impl/function.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> asin(const vec<T, N>& x) +{ + const vec<Tout, N> xx = x; + return atan2(xx, sqrt(Tout(1) - xx * xx)); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> acos(const vec<T, N>& x) +{ + const vec<Tout, N> xx = x; + return atan2(sqrt(Tout(1) - xx * xx), xx); +} +KFR_HANDLE_SCALAR(asin) +KFR_HANDLE_SCALAR(acos) +} // namespace intrinsics +KFR_I_FN(asin) +KFR_I_FN(acos) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp @@ -0,0 +1,230 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "../../math/abs.hpp" +#include "../../math/select.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx) +{ + vec<f32, N> x = xx, y = yy; + vec<f32, N> s, t, u; + vec<i32, N> q; + q = select(x < 0, -2, 0); + x = select(x < 0, -x, x); + mask<i32, N> m; + m = y > x; + t = x; + x = select(m, y, x); + y = select(m, -t, y); + q = select(m, q + 1, q); + s = y / x; + t = s * s; + u = 0.00282363896258175373077393f; + u = fmadd(u, t, -0.0159569028764963150024414f); + u = fmadd(u, t, 0.0425049886107444763183594f); + u = fmadd(u, t, -0.0748900920152664184570312f); + u = fmadd(u, t, 0.106347933411598205566406f); + u = fmadd(u, t, -0.142027363181114196777344f); + u = fmadd(u, t, 0.199926957488059997558594f); + u = fmadd(u, t, -0.333331018686294555664062f); + t = u * t * s + s; + t = innercast<f32>(q) * 1.5707963267948966192313216916398f + t; + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx) +{ + vec<f64, N> x = xx, y = yy; + vec<f64, N> s, t, u; + vec<i64, N> q; + q = select(x < 0, i64(-2), i64(0)); + x = select(x < 0, -x, x); + mask<i64, N> m; + m = y > x; + t = x; + x = select(m, y, x); + y = select(m, -t, y); + q = select(m, q + i64(1), q); + s = y / x; + t = s * s; + u = -1.88796008463073496563746e-05; + u = fmadd(u, t, 0.000209850076645816976906797); + u = fmadd(u, t, -0.00110611831486672482563471); + u = fmadd(u, t, 
0.00370026744188713119232403); + u = fmadd(u, t, -0.00889896195887655491740809); + u = fmadd(u, t, 0.016599329773529201970117); + u = fmadd(u, t, -0.0254517624932312641616861); + u = fmadd(u, t, 0.0337852580001353069993897); + u = fmadd(u, t, -0.0407629191276836500001934); + u = fmadd(u, t, 0.0466667150077840625632675); + u = fmadd(u, t, -0.0523674852303482457616113); + u = fmadd(u, t, 0.0587666392926673580854313); + u = fmadd(u, t, -0.0666573579361080525984562); + u = fmadd(u, t, 0.0769219538311769618355029); + u = fmadd(u, t, -0.090908995008245008229153); + u = fmadd(u, t, 0.111111105648261418443745); + u = fmadd(u, t, -0.14285714266771329383765); + u = fmadd(u, t, 0.199999999996591265594148); + u = fmadd(u, t, -0.333333333333311110369124); + t = u * t * s + s; + t = innercast<f64>(q) * 1.5707963267948966192313216916398 + t; + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) +{ + vec<f32, N> r = atan2k(abs(y), x); + constexpr f32 pi = 3.1415926535897932384626433832795f; + constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f; + constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f; + r = mulsign(r, x); + r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r); + r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r); + r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); + return r; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) +{ + vec<f64, N> r = atan2k(abs(y), x); + constexpr f64 pi = 3.1415926535897932384626433832795; + constexpr f64 pi_over_2 = 1.5707963267948966192313216916398; + constexpr f64 pi_over_4 = 0.78539816339744830961566084581988; + r = mulsign(r, x); + r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); + r = select(isinf(y), pi_over_2 - 
select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); + r = select(y == 0.0, select(x < 0., pi, 0.), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); + return r; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan(const vec<f32, N>& x) +{ + vec<f32, N> t, u; + vec<i32, N> q; + q = select(x < 0.f, 2, 0); + vec<f32, N> s = select(x < 0.f, -x, x); + q = select(s > 1.f, q | 1, q); + s = select(s > 1.f, 1.0f / s, s); + t = s * s; + u = 0.00282363896258175373077393f; + u = fmadd(u, t, -0.0159569028764963150024414f); + u = fmadd(u, t, 0.0425049886107444763183594f); + u = fmadd(u, t, -0.0748900920152664184570312f); + u = fmadd(u, t, 0.106347933411598205566406f); + u = fmadd(u, t, -0.142027363181114196777344f); + u = fmadd(u, t, 0.199926957488059997558594f); + u = fmadd(u, t, -0.333331018686294555664062f); + t = s + s * (t * u); + t = select((q & 1) != 0, 1.570796326794896557998982f - t, t); + t = select((q & 2) != 0, -t, t); + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan(const vec<f64, N>& x) +{ + vec<f64, N> t, u; + vec<i64, N> q; + q = select(x < 0.0, i64(2), i64(0)); + vec<f64, N> s = select(x < 0.0, -x, x); + q = select(s > 1.0, q | 1, q); + s = select(s > 1.0, 1.0 / s, s); + t = s * s; + u = -1.88796008463073496563746e-05; + u = fmadd(u, t, 0.000209850076645816976906797); + u = fmadd(u, t, -0.00110611831486672482563471); + u = fmadd(u, t, 0.00370026744188713119232403); + u = fmadd(u, t, -0.00889896195887655491740809); + u = fmadd(u, t, 0.016599329773529201970117); + u = fmadd(u, t, -0.0254517624932312641616861); + u = fmadd(u, t, 0.0337852580001353069993897); + u = fmadd(u, t, -0.0407629191276836500001934); + u = fmadd(u, t, 0.0466667150077840625632675); + u = fmadd(u, t, -0.0523674852303482457616113); + u = fmadd(u, t, 0.0587666392926673580854313); + u = fmadd(u, t, -0.0666573579361080525984562); + u = fmadd(u, t, 0.0769219538311769618355029); + u = fmadd(u, t, -0.090908995008245008229153); + u = fmadd(u, t, 
0.111111105648261418443745); + u = fmadd(u, t, -0.14285714266771329383765); + u = fmadd(u, t, 0.199999999996591265594148); + u = fmadd(u, t, -0.333333333333311110369124); + t = s + s * (t * u); + t = select((q & 1) != 0, 1.570796326794896557998982 - t, t); + t = select((q & 2) != 0, -t, t); + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atandeg(const vec<f32, N>& x) +{ + return atan(x) * c_radtodeg<f32>; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atandeg(const vec<f64, N>& x) +{ + return atan(x) * c_radtodeg<f64>; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x) +{ + return atan2(y, x) * c_radtodeg<f32>; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x) +{ + return atan2(y, x) * c_radtodeg<f64>; +} + +KFR_HANDLE_SCALAR(atan) +KFR_HANDLE_SCALAR(atan2) +KFR_HANDLE_SCALAR(atandeg) +KFR_HANDLE_SCALAR(atan2deg) +} // namespace intrinsics +KFR_I_FN(atan) +KFR_I_FN(atandeg) +KFR_I_FN(atan2) +KFR_I_FN(atan2deg) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/clamp.hpp b/include/kfr/math/impl/clamp.hpp @@ -0,0 +1,55 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T> +KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) +{ + return max(min(x, hi), zerovector<T, N>()); +} +} // namespace intrinsics +KFR_I_FN(clamp) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/gamma.hpp b/include/kfr/math/impl/gamma.hpp @@ -0,0 +1,71 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once +#include "../../math/log_exp.hpp" +#include "../../simd/impl/function.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <typename T> +constexpr T gamma_precalc[] = { + 0x2.81b263fec4e08p+0, 0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16, + -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16, -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12, + -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0, -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24, +}; + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> gamma(const vec<T, N>& z) +{ + constexpr size_t Count = arraysize(gamma_precalc<T>); + vec<T, N> accm = gamma_precalc<T>[0]; + CMT_LOOP_UNROLL + for (size_t k = 1; k < Count; k++) + accm += gamma_precalc<T>[k] / (z + innercast<utype<T>>(k)); + accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); + return accm / z; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> factorial_approx(const vec<T, N>& x) +{ + return gamma(x + T(1)); +} +KFR_HANDLE_SCALAR(gamma) +KFR_HANDLE_SCALAR(factorial_approx) +} // namespace intrinsics +KFR_I_FN(gamma) +KFR_I_FN(factorial_approx) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp @@ -0,0 +1,99 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/log_exp.hpp" +#include "../../math/min_max.hpp" +#include "../../math/select.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> sinh(const vec<T, N>& x) +{ + const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); + return (exp(xx) - exp(-xx)) * Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> cosh(const vec<T, N>& x) +{ + const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); + return (exp(xx) + exp(-xx)) * Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> tanh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(2 * x); + return (a - 1) / (a + 1); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> coth(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(2 * x); + return (a + 1) / (a - 1); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> sinhcosh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(x); + const vec<Tout, N> b = exp(-x); + return subadd(a, b) * 
Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> coshsinh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(x); + const vec<Tout, N> b = exp(-x); + return addsub(a, b) * Tout(0.5); +} + +KFR_HANDLE_SCALAR_1_T(sinh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cosh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(tanh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(coth, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sinhcosh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(coshsinh, flt_type<T>) +} // namespace intrinsics +KFR_I_FN(sinh) +KFR_I_FN(cosh) +KFR_I_FN(tanh) +KFR_I_FN(coth) +KFR_I_FN(sinhcosh) +KFR_I_FN(coshsinh) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp @@ -0,0 +1,335 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "../../math/abs.hpp"
#include "../../math/clamp.hpp"
#include "../../math/min_max.hpp"
#include "../../math/round.hpp"
#include "../../math/select.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// vilogbp1(d): per-lane biased exponent extraction, returning ilogb(d) + 1.
// Inputs below ~5.42e-20 (i.e. near/below the normal range) are first scaled
// by 2^64 so the bit-field extraction also covers denormals; the extra 64 is
// subtracted back out in the select.
template <size_t N>
KFR_INTRINSIC vec<i32, N> vilogbp1(const vec<f32, N>& d)
{
    mask<i32, N> m = d < 5.421010862427522E-20f;
    vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff;
    q = select(m, q - (64 + 0x7e), q - 0x7e);
    return q;
}

// f64 variant: small inputs are pre-scaled by 2^300 before extracting the
// 11-bit exponent field; the 300 is removed again afterwards.
template <size_t N>
KFR_INTRINSIC vec<i64, N> vilogbp1(const vec<f64, N>& d)
{
    mask<i64, N> m = d < 4.9090934652977266E-91;
    vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff;
    q = select(m, q - (300 + 0x03fe), q - 0x03fe);
    return q;
}

// vldexpk(x, q): compute x * 2^q. The exponent q is split into parts that are
// applied as separate power-of-two factors (pow4 applies one part four times),
// avoiding overflow of a single biased-exponent field for large |q|.
template <size_t N>
KFR_INTRINSIC vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q)
{
    vec<i32, N> m = q >> 31;
    m = (((m + q) >> 6) - m) << 4;
    const vec<i32, N> qq = q - (m << 2);
    m = clamp(m + 0x7f, vec<i32, N>(0xff));
    vec<f32, N> u = pow4(bitcast<f32>(innercast<i32>(m) << 23));
    return x * u * bitcast<f32>((innercast<i32>(qq + 0x7f)) << 23);
}

template <size_t N>
KFR_INTRINSIC vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
{
    vec<i64, N> m = q >> 31;
    m = (((m + q) >> 9) - m) << 7;
    const vec<i64, N> qq = q - (m << 2);
    m = clamp(m + 0x3ff, i64(0x7ff));
    vec<f64, N> u = pow4(bitcast<f64>(innercast<i64>(m) << 52));
    return x * u * bitcast<f64>((innercast<i64>(qq + 0x3ff)) << 52);
}

// logb(x): unbiased exponent as a floating-point value; -infinity for x == 0.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> logb(const vec<T, N>& x)
{
    return select(x == T(), -avoid_odr_use(c_infinity<T>), static_cast<vec<T, N>>(vilogbp1(x) - 1));
}

// Natural logarithm, f32. Decomposes d = m * 2^e with m near 1 (the 0.7071
// pre-scale centers m around sqrt(2)/2..sqrt(2)), then evaluates a polynomial
// in x = (m - 1) / (m + 1). Special cases via sp and the final select:
// d < 0 -> qnan, d == 0 -> -infinity.
template <size_t N>
KFR_INTRINSIC vec<f32, N> log(const vec<f32, N>& d)
{
    vec<i32, N> e = vilogbp1(d * 0.7071); // 0.7071 ~ 1/sqrt(2) = 0.70710678118654752440084436210485
    vec<f32, N> m = vldexpk(d, -e);

    vec<f32, N> x = (m - 1.0f) / (m + 1.0f);
    vec<f32, N> x2 = x * x;

    vec<f32, N> sp =
        select(d < 0, avoid_odr_use(constants<f32>::qnan), avoid_odr_use(constants<f32>::neginfinity));

    vec<f32, N> t = 0.2371599674224853515625f;
    t = fmadd(t, x2, 0.285279005765914916992188f);
    t = fmadd(t, x2, 0.400005519390106201171875f);
    t = fmadd(t, x2, 0.666666567325592041015625f);
    t = fmadd(t, x2, 2.0f);

    // ln(d) = poly(x) * x + e * ln(2)
    x = x * t + c_log_2<f32> * innercast<f32>(e);
    x = select(d > 0, x, sp);

    return x;
}

// Natural logarithm, f64 — same scheme as the f32 version with a
// higher-degree polynomial.
template <size_t N>
KFR_INTRINSIC vec<f64, N> log(const vec<f64, N>& d)
{
    vec<i64, N> e = vilogbp1(d * 0.7071); // 0.7071 ~ 1/sqrt(2) = 0.70710678118654752440084436210485
    vec<f64, N> m = vldexpk(d, -e);

    vec<f64, N> x = (m - 1.0) / (m + 1.0);
    vec<f64, N> x2 = x * x;

    vec<f64, N> sp =
        select(d < 0, avoid_odr_use(constants<f64>::qnan), avoid_odr_use(constants<f64>::neginfinity));

    vec<f64, N> t = 0.148197055177935105296783;
    t = fmadd(t, x2, 0.153108178020442575739679);
    t = fmadd(t, x2, 0.181837339521549679055568);
    t = fmadd(t, x2, 0.22222194152736701733275);
    t = fmadd(t, x2, 0.285714288030134544449368);
    t = fmadd(t, x2, 0.399999999989941956712869);
    t = fmadd(t, x2, 0.666666666666685503450651);
    t = fmadd(t, x2, 2);

    x = x * t + avoid_odr_use(constants<f64>::log_2) * innercast<f64>(e);
    x = select(d > 0, x, sp);

    return x;
}

// log2(x) = ln(x) / ln(2)
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> log2(const vec<T, N>& x)
{
    return log(innercast<Tout>(x)) * avoid_odr_use(constants<Tout>::recip_log_2);
}
// log10(x) = ln(x) / ln(10)
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> log10(const vec<T, N>& x)
{
    return log(innercast<Tout>(x)) * avoid_odr_use(constants<Tout>::recip_log_10);
}

// Exponential, f32. Argument reduction d = q*ln(2) + s; ln(2) is split into
// two parts (ln2_part1 + ln2_part2) so the two fmadd subtractions lose less
// precision. e^s is a degree-7 polynomial, then the result is scaled by 2^q
// via vldexpk. exp(-infinity) is forced to exactly 0.
template <size_t N>
KFR_INTRINSIC vec<f32, N> exp(const vec<f32, N>& d)
{
    const f32 ln2_part1 = 0.6931457519f;
    const f32 ln2_part2 = 1.4286067653e-6f;

    vec<i32, N> q = innercast<i32>(floor(d * avoid_odr_use(constants<f32>::recip_log_2)));
    vec<f32, N> s, u;

    s = fmadd(innercast<f32>(q), -ln2_part1, d);
    s = fmadd(innercast<f32>(q), -ln2_part2, s);

    const f32 c2 = 0.4999999105930328369140625f;
    const f32 c3 = 0.166668415069580078125f;
    const f32 c4 = 4.16539050638675689697265625e-2f;
    const f32 c5 = 8.378830738365650177001953125e-3f;
    const f32 c6 = 1.304379315115511417388916015625e-3f;
    const f32 c7 = 2.7555381529964506626129150390625e-4f;

    u = c7;
    u = fmadd(u, s, c6);
    u = fmadd(u, s, c5);
    u = fmadd(u, s, c4);
    u = fmadd(u, s, c3);
    u = fmadd(u, s, c2);

    u = s * s * u + s + 1.0f;
    u = vldexpk(u, q);

    u = select(d == avoid_odr_use(constants<f32>::neginfinity), 0.f, u);

    return u;
}

// Exponential, f64 — same reduction with a degree-11 polynomial.
template <size_t N>
KFR_INTRINSIC vec<f64, N> exp(const vec<f64, N>& d)
{
    const f64 ln2_part1 = 0.69314717501401901245;
    const f64 ln2_part2 = 5.545926273775592108e-009;

    vec<i64, N> q = innercast<i64>(floor(d * avoid_odr_use(constants<f64>::recip_log_2)));
    vec<f64, N> s, u;

    s = fmadd(innercast<f64>(q), -ln2_part1, d);
    s = fmadd(innercast<f64>(q), -ln2_part2, s);

    const f64 c2 = 0.499999999999994948485237955537741072475910186767578;
    const f64 c3 = 0.166666666667024204739888659787538927048444747924805;
    const f64 c4 = 4.16666666578945840693215529881854308769106864929199e-2;
    const f64 c5 = 8.3333334397461874404333670440792047884315252304077e-3;
    const f64 c6 = 1.3888881489747750223179290074426717183087021112442e-3;
    const f64 c7 = 1.9841587032493949419205414574918222569976933300495e-4;
    const f64 c8 = 2.47929324077393282239802768662784160369483288377523e-5;
    const f64 c9 = 2.77076037925831049422552981864598109496000688523054e-6;
    const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
    const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;

    u = c11;
    u = fmadd(u, s, c10);
    u = fmadd(u, s, c9);
    u = fmadd(u, s, c8);
    u = fmadd(u, s, c7);
    u = fmadd(u, s, c6);
    u = fmadd(u, s, c5);
    u = fmadd(u, s, c4);
    u = fmadd(u, s, c3);
    u = fmadd(u, s, c2);

    u = s * s * u + s + 1.0;
    u = vldexpk(u, q);

    u = select(d == avoid_odr_use(constants<f64>::neginfinity), 0.0, u);

    return u;
}
// exp2(x) = e^(x * ln(2))
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> exp2(const vec<T, N>& x)
{
    return exp(x * avoid_odr_use(constants<Tout>::log_2));
}
// exp10(x) = e^(x * ln(10))
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> exp10(const vec<T, N>& x)
{
    return exp(x * avoid_odr_use(constants<Tout>::log_10));
}

// pow(a, b) = exp(b * ln|a|) with sign fix-up:
//   a > 0  -> t
//   a == 0 -> 0
//   a < 0  -> t or -t when b is an integer (sign chosen by b's parity),
//             qnan otherwise (negative base, non-integer exponent).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
{
    const vec<T, N> t = exp(b * log(abs(a)));
    const mask<T, N> isint = floor(b) == b;
    const mask<T, N> iseven = (innercast<itype<T>>(b) & 1) == 0;
    return select(
        a > T(), t,
        select(a == T(), T(),
               select(isint, select(iseven, t, -t), broadcast<N>(avoid_odr_use(constants<T>::qnan)))));
}

// b-th root: x^(1/b) = exp(ln(x) / b)
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b)
{
    return exp(reciprocal(b) * log(x));
}

// Cube root via pow(x, 1/3).
// NOTE(review): through pow, a negative x yields qnan here (non-integer
// exponent path) rather than the real negative cube root — confirm the
// intended domain is x >= 0.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> cbrt(const vec<T, N>& x)
{
    return pow<T, N>(x, T(0.333333333333333333333333333333333));
}

// Integer-typed inputs are converted to the matching float type first.
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> cbrt(const vec<T, N>& x)
{
    return cbrt(innercast<Tout>(x));
}

// Scalar forwarding through the vector kernels.
KFR_HANDLE_SCALAR_1_T(exp, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(exp2, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(exp10, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log2, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log10, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(logb, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(pow, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(root, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(cbrt, flt_type<T>)

KFR_HANDLE_ARGS_T(exp, flt_type<T>)
KFR_HANDLE_ARGS_T(exp2, flt_type<T>)
KFR_HANDLE_ARGS_T(exp10, flt_type<T>)
KFR_HANDLE_ARGS_T(log, flt_type<T>)
KFR_HANDLE_ARGS_T(log2, flt_type<T>)
KFR_HANDLE_ARGS_T(log10, flt_type<T>)
KFR_HANDLE_ARGS_T(logb, flt_type<T>)
KFR_HANDLE_ARGS_T(pow, flt_type<T>)
KFR_HANDLE_ARGS_T(root, flt_type<T>)
KFR_HANDLE_ARGS_T(cbrt, flt_type<T>)

KFR_HANDLE_NOT_F_1(exp)
KFR_HANDLE_NOT_F_1(log)
KFR_HANDLE_NOT_F_1(logb)
KFR_HANDLE_NOT_F_1(pow)
KFR_HANDLE_NOT_F_1(root)
KFR_HANDLE_NOT_F_1(cbrt)

// logn(a, b): logarithm of a to base b = ln(a) / ln(b)
template <typename T1, typename T2>
KFR_INTRINSIC flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b)
{
    return log(a) / log(b);
}

// logm(a, b): ln(a) * b
template <typename T1, typename T2>
KFR_INTRINSIC flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b)
{
    return log(a) * b;
}

// exp_fmadd(x, m, a): exp(x * m + a)
template <typename T1, typename T2, typename T3>
KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a)
{
    return exp(fmadd(x, m, a));
}

// log_fmadd(x, m, a): ln(x) * m + a
template <typename T1, typename T2, typename T3>
KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a)
{
    return fmadd(log(x), m, a);
}
} // namespace intrinsics
// Expose fn_* functor wrappers for expression templates.
KFR_I_FN(exp)
KFR_I_FN(exp2)
KFR_I_FN(exp10)
KFR_I_FN(log)
KFR_I_FN(log2)
KFR_I_FN(log10)
KFR_I_FN(logb)
KFR_I_FN(logn)
KFR_I_FN(logm)
KFR_I_FN(exp_fmadd)
KFR_I_FN(log_fmadd)
KFR_I_FN(pow)
KFR_I_FN(root)
KFR_I_FN(cbrt)
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -0,0 +1,278 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +#if defined CMT_ARCH_SSE41 + +// horizontal OR +KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u64sse& x) 
{ return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +#endif + +#if defined CMT_ARCH_AVX +// horizontal OR +KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } + +KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } + 
+KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } + +#if defined CMT_ARCH_AVX512 +// horizontal OR +KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); } +KFR_INTRINSIC bool bittestall(const f64avx512& x) { return 
!~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } +KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } + +#endif + +#elif defined CMT_ARCH_SSE41 +KFR_INTRINSIC bool bittestany(const f32sse& x) +{ + return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); +} +KFR_INTRINSIC bool bittestany(const f64sse& x) +{ + return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); +} +KFR_INTRINSIC bool bittestall(const f32sse& x) +{ + return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); +} +KFR_INTRINSIC bool bittestall(const f64sse& x) +{ + return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); +} +#endif + +#if !defined CMT_ARCH_SSE41 + +KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); } +KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); } +KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool 
bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); } + +KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); } +KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); } +KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); } +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC bool bittestall(const u32neon& a) +{ + 
const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_INTRINSIC bool bittestany(const u32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } + +KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 
vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#else + +template <typename T, size_t N> +KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x) +{ + typename bitmask<N>::type val = 0; + for (size_t i = 0; i < N; i++) + { + val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; + } + return val; +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const vec<T, N>& x) +{ + return getmask(x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y) +{ + return bittestany(x & y); +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const vec<T, N>& x) +{ + return !getmask(~x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y) +{ + return !bittestany(~x & y); +} +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp @@ -0,0 +1,236 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "../../math/abs.hpp"
#include "../../math/select.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// Per-lane min/max. Each type/width combination maps to the matching native
// intrinsic where the target ISA provides one; otherwise a select() based
// fallback is used.

#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS

// SSE2 provides float/double min/max plus only epu8 and epi16 integer forms.
KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); }
KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); }
KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); }

KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); }
KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); }
KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); }

#if defined CMT_ARCH_AVX2
KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); }

KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); }

#endif

#if defined CMT_ARCH_AVX512
KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); }
KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); }
KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); }
KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); }

KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); }
KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); }
KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); }

// NOTE(review): the 128/256-bit epi64/epu64 min/max intrinsics below are
// AVX-512VL encodings — confirm CMT_ARCH_AVX512 implies VL support here.
KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); }

KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); }
#else
// No native 64-bit integer min/max before AVX-512: compare and select.
KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
#endif

#if defined CMT_ARCH_AVX
KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); }
KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); }
KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); }
KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); }
#endif

#if defined CMT_ARCH_SSE41
// SSE4.1 adds the remaining 8/16/32-bit signed/unsigned forms.
KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); }

KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); }
#else
KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y,
x, y); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } + +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return 
select(x > y, x, y); } + +KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } +KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } +#else +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x < y, x, y); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x > y, x, y); +} +#endif + +template <typename T> +KFR_INTRINSIC T min(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T max(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::min(); +} +template <typename T> +KFR_INTRINSIC T absmin(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T absmax(initialvalue<T>) +{ + return 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) +{ + return min(abs(x), abs(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) +{ + return max(abs(x), abs(y)); +} + +KFR_HANDLE_SCALAR(min) +KFR_HANDLE_SCALAR(max) +KFR_HANDLE_SCALAR(absmin) +KFR_HANDLE_SCALAR(absmax) +} // namespace intrinsics +KFR_I_FN(min) +KFR_I_FN(max) +KFR_I_FN(absmin) +KFR_I_FN(absmax) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/modzerobessel.hpp b/include/kfr/math/impl/modzerobessel.hpp @@ -0,0 +1,104 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/log_exp.hpp" +#include "../../simd/impl/function.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> modzerobessel(const vec<T, N>& x) +{ + constexpr static T bessel_coef[] = { T(0.25), + T(0.027777777777777776236), + T(0.0017361111111111110147), + T(6.9444444444444444384e-005), + T(1.9290123456790123911e-006), + T(3.9367598891408417495e-008), + T(6.1511873267825652335e-010), + T(7.5940584281266239246e-012), + T(7.5940584281266233693e-014), + T(6.2760813455591932909e-016), + T(4.3583898233049949985e-018), + T(2.5789288895295827557e-020), + T(1.3157800456783586208e-022), + T(5.8479113141260384983e-025), + T(2.2843403570804837884e-027), + T(7.904291893012054025e-030), + T(2.4395962632753252792e-032), + T(6.75788438580422547e-035), + T(1.689471096451056426e-037), + T(3.8310002187098784929e-040), + T(7.9152897080782616517e-043), + T(1.4962740468957016443e-045), + T(2.5976979980828152196e-048), + T(4.1563167969325041577e-051), + T(6.1483976285983795968e-054), + T(8.434015951438105991e-057), + T(1.0757673407446563809e-059), + T(1.2791526049282476926e-062), + T(1.4212806721424974034e-065), + T(1.4789601166935457918e-068), + T(1.4442969889585408123e-071), + T(1.3262598613026086927e-074), + T(1.1472836170437790782e-077), + T(9.3655805472961564331e-081), + T(7.2265282000741942594e-084), + T(5.2786911614858977913e-087), + T(3.6556032974279072401e-090), + T(2.4034209713529963119e-093), + T(1.5021381070956226783e-096) }; + + const vec<T, N> x_2 = x * 0.5; + const vec<T, N> x_2_sqr = x_2 * x_2; + vec<T, N> num = x_2_sqr; + vec<T, N> result; + result = 1 + x_2_sqr; + + CMT_LOOP_UNROLL + for (size_t i = 0; i < (sizeof(T) == 4 ? 
20 : 39); i++) + { + result = fmadd((num *= x_2_sqr), bessel_coef[i], result); + } + return result; +} + +KFR_HANDLE_SCALAR(modzerobessel) +} // namespace intrinsics +KFR_I_FN(modzerobessel) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp @@ -0,0 +1,282 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" +#include "abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ss(V) \ + _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_sd(V) \ + _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) +#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) + +#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } +KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } +KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } 
+KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } +KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } +KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } +KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } +KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } +KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } +KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } + +#if defined CMT_ARCH_AVX + +KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } +KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } +KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } +KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } +KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } +KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } +KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } +KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } +KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } +KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } + +#endif + +#if defined CMT_ARCH_AVX512 + +KFR_INTRINSIC f32avx512 floor(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 round(const f32avx512& value) +{ + return 
_mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 floor(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 round(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } +KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>::value) + +#else + +// fallback + +template <typename T> +constexpr T fp_precision_limit = 4503599627370496.0; +template <> +constexpr f32 fp_precision_limit<f32> = 16777216.0f; + +template <size_t N> +KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) 
+{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, + innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, + innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) +{ + return x - floor(x); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) +{ + return x - floor(x); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) +{ + return T(0); +} + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) +{ + return 
innercast<IT>(floor(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) +{ + return innercast<IT>(ceil(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) +{ + return innercast<IT>(trunc(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) +{ + return innercast<IT>(round(value)); +} + +KFR_HANDLE_SCALAR(floor) +KFR_HANDLE_SCALAR(ceil) +KFR_HANDLE_SCALAR(round) +KFR_HANDLE_SCALAR(trunc) +KFR_HANDLE_SCALAR(fract) +KFR_HANDLE_SCALAR(ifloor) +KFR_HANDLE_SCALAR(iceil) +KFR_HANDLE_SCALAR(iround) +KFR_HANDLE_SCALAR(itrunc) +} // namespace intrinsics +KFR_I_FN(floor) +KFR_I_FN(ceil) +KFR_I_FN(round) +KFR_I_FN(trunc) +KFR_I_FN(fract) +KFR_I_FN(ifloor) +KFR_I_FN(iceil) +KFR_I_FN(iround) +KFR_I_FN(itrunc) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#undef KFR_mm_trunc_ps +#undef KFR_mm_roundnearest_ps +#undef KFR_mm_trunc_pd +#undef KFR_mm_roundnearest_pd +#undef KFR_mm_trunc_ss +#undef KFR_mm_roundnearest_ss +#undef KFR_mm_trunc_sd +#undef KFR_mm_roundnearest_sd +#undef KFR_mm_floor_ss +#undef KFR_mm_floor_sd +#undef KFR_mm_ceil_ss +#undef KFR_mm_ceil_sd +#undef KFR_mm256_trunc_ps +#undef KFR_mm256_roundnearest_ps +#undef KFR_mm256_trunc_pd +#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp @@ -0,0 +1,205 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/select.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +// Generic functions +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) +{ + using UT = utype<T>; + constexpr size_t shift = typebits<UT>::bits - 1; + vec<UT, N> aa = bitcast<UT>(a); + vec<UT, N> bb = bitcast<UT>(b); + const vec<UT, N> sum = aa + bb; + aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); + + return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), a, bitcast<T>(sum)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) +{ + using UT = utype<T>; + constexpr size_t shift = typebits<UT>::bits - 1; + vec<UT, N> aa = bitcast<UT>(a); + vec<UT, N> bb = bitcast<UT>(b); + const vec<UT, N> diff = aa - bb; + aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); + + return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) +{ + const vec<T, N> t = allonesvector(a); + return select(a > t - b, t, a + b); +} +template <typename T, size_t N> 
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) +{ + return select(a < b, zerovector(a), a - b); +} + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satadd(const i8avx& x, const 
i8avx& y) { return _mm256_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } +KFR_INTRINSIC u8avx512 
satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satadd(const 
i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } + +KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#else +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_sub(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif +KFR_HANDLE_SCALAR(satadd) +KFR_HANDLE_SCALAR(satsub) +} // namespace intrinsics 
+KFR_I_FN(satadd) +KFR_I_FN(satsub) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp @@ -0,0 +1,329 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y) +{ + return _mm_blendv_ps(y.v, x.v, m.v); +} +KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y) +{ + return _mm_blendv_pd(y.v, x.v, m.v); +} + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y) +{ + return _mm256_blendv_pd(y.v, x.v, m.v); +} +KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y) +{ + return _mm256_blendv_ps(y.v, x.v, m.v); +} +#endif + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u16avx select(const u16avx& m, 
const u16avx& x, const u16avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y) +{ + return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v); +} +KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y) +{ + return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v); +} +KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y) +{ + return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y) +{ + return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y) +{ + return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y) +{ + return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i8avx512 select(const i8avx512& m, const 
i8avx512& x, const i8avx512& y) +{ + return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y) +{ + return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y) +{ + return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y) +{ + return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + vec<T, N> r; + intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); }); + return r; + // return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); + // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +{ + return concat2(select(a.h.low, b, c), 
select(a.h.high, b, c)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +{ + return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +{ + return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); +} + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y) +{ + return vbslq_f32(m.v, x.v, y.v); +} +KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y) +{ + return vbslq_s8(m.v, x.v, y.v); +} +KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y) +{ + return vbslq_u8(m.v, x.v, y.v); +} +KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y) +{ + return vbslq_s16(m.v, x.v, y.v); +} +KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y) +{ + return vbslq_u16(m.v, x.v, y.v); +} +KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y) +{ + 
return vbslq_s32(m.v, x.v, y.v); +} +KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y) +{ + return vbslq_u32(m.v, x.v, y.v); +} +KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y) +{ + return vbslq_s64(m.v, x.v, y.v); +} +KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y) +{ + return vbslq_u64(m.v, x.v, y.v); +} + +#ifdef CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +{ + return vbslq_f64(m.v, x.v, y.v); +} +#else +KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +{ + return y ^ ((x ^ y) & m); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +{ + return select(m, vec<T, N>(x), vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +{ + return select(m, x, vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +{ + return select(m, vec<T, N>(x), y); +} + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) +{ + return y ^ ((x ^ y) & m); +} 
+template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +{ + return select(m, vec<T, N>(x), vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +{ + return select(m, x, vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +{ + return select(m, vec<T, N>(x), y); +} +#endif + +} // namespace intrinsics +KFR_I_FN(select) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp @@ -0,0 +1,310 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/min_max.hpp" +#include "../../math/round.hpp" +#include "../../math/select.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" +#include "../../simd/shuffle.hpp" + +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0) +{ + return select(msk, a0, b0); +} + +template <typename T, size_t N, typename... Ts> +KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0, + const T& a1, const T& b1, const Ts&... values) +{ + return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) +{ + const vec<T, N> xabs = abs(x); + constexpr T div = constants<T>::fold_constant_div; + vec<T, N> y = floor(xabs / div); + quadrant = innercast<itype<T>>(innercast<int>(y - floor(y * T(1.0 / 16.0)) * T(16.0))); + + const mask<T, N> msk = (quadrant & 1) != 0; + quadrant = kfr::select(msk, quadrant + 1, quadrant); + y = select(msk, y + T(1.0), y); + quadrant = quadrant & 7; + + constexpr T hi = constants<T>::fold_constant_hi; + constexpr T rem1 = constants<T>::fold_constant_rem1; + constexpr T rem2 = constants<T>::fold_constant_rem2; + return (xabs - y * hi) - y * rem1 - y * rem2; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> fold_range(const vec<T, N>& x) +{ + vec<itype<T>, N> q; + return trig_fold(x, q); +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask) +{ + constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, 
-1.6666667163e-01f); + constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f); + constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f); + constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f); + constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f); + constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f); + constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f); + constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f); + constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f); + constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f); + + const vec<f32, N> x2 = folded * folded; + + vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, + cos_c8, sin_c8, cos_c10, sin_c10); + + formula = select(cosmask, formula, formula * folded); + return formula; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask) +{ + constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01); + constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03); + constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04); + constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06); + constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08); + constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10); + constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13); + constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14); + constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01); + constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02); + constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03); + constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 
2.480158730125666056e-05); + constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07); + constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09); + constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11); + constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14); + + vec<f64, N> x2 = folded * folded; + vec<f64, N> formula = + trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8, + cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16); + + formula = select(cosmask, formula, formula * folded); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x_full, quadrant); + + mask<T, N> flip_sign = + kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask(); + + mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); + usecos = usecos ^ cosmask; + + vec<T, N> formula = trig_sincos(folded, usecos); + + mask<T, N> negmask = x_full < T(0); + + flip_sign = flip_sign ^ (negmask & ~cosmask); + + formula = select(flip_sign, -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sin(const vec<T, N>& x) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x, quadrant); + + mask<T, N> flip_sign = quadrant >= itype<T>(4); + mask<T, N> usecos = (quadrant == itype<T>(2)) || (quadrant == itype<T>(6)); + + vec<T, N> formula = trig_sincos(folded, usecos); + + formula = select(flip_sign ^ mask<T, N>(x), -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> cos(const vec<T, N>& x) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x, 
quadrant); + + mask<T, N> eq4 = (quadrant == 4); + mask<T, N> flip_sign = (quadrant == 2) || eq4; + mask<T, N> usecos = (quadrant == 0) || eq4; + + vec<T, N> formula = trig_sincos(folded, usecos); + + formula = select(flip_sign, -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fastsin(const vec<T, N>& x) +{ + const vec<T, N> msk = broadcast<N>(special_constants<T>::highbitmask()); + + constexpr static T c2 = -0.16665853559970855712890625; + constexpr static T c4 = +8.31427983939647674560546875e-3; + constexpr static T c6 = -1.85423981747590005397796630859375e-4; + + const vec<T, N> pi = c_pi<T>; + + vec<T, N> xx = x - pi; + vec<T, N> y = abs(xx); + y = select(y > c_pi<T, 1, 2>, pi - y, y); + y = y ^ (msk & ~xx); + + vec<T, N> y2 = y * y; + vec<T, N> formula = c6; + vec<T, N> y3 = y2 * y; + formula = fmadd(formula, y2, c4); + formula = fmadd(formula, y2, c2); + formula = formula * y3 + y; + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fastcos(const vec<T, N>& x) +{ + x += c_pi<T, 1, 2>; + x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x); + return fastsin(x); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sincos(const vec<T, N>& x) +{ + return sincos_mask(x, internal::oddmask<T, N>()); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> cossin(const vec<T, N>& x) +{ + return sincos_mask(x, internal::evenmask<T, N>()); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sinc(const vec<T, N>& x) +{ + return select(abs(x) <= avoid_odr_use(constants<T>::epsilon), T(1), sin(x) / x); +} + +KFR_HANDLE_SCALAR_1_T(sin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cos, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(fastsin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(fastcos, 
flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sincos, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cossin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sinc, flt_type<T>) + +KFR_HANDLE_NOT_F_1(sin) +KFR_HANDLE_NOT_F_1(cos) +KFR_HANDLE_NOT_F_1(fastsin) +KFR_HANDLE_NOT_F_1(fastcos) +KFR_HANDLE_NOT_F_1(sincos) +KFR_HANDLE_NOT_F_1(cossin) +KFR_HANDLE_NOT_F_1(sinc) + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout sindeg(const T& x) +{ + return sin(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout cosdeg(const T& x) +{ + return cos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout fastsindeg(const T& x) +{ + return fastsin(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout fastcosdeg(const T& x) +{ + return fastcos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout sincosdeg(const T& x) +{ + return sincos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout cossindeg(const T& x) +{ + return cossin(x * avoid_odr_use(constants<Tout>::degtorad)); +} +} // namespace intrinsics + +KFR_I_FN(sin) +KFR_I_FN(cos) +KFR_I_FN(fastsin) +KFR_I_FN(fastcos) +KFR_I_FN(sincos) +KFR_I_FN(cossin) + +KFR_I_FN(sindeg) +KFR_I_FN(cosdeg) +KFR_I_FN(fastsindeg) +KFR_I_FN(fastcosdeg) +KFR_I_FN(sincosdeg) +KFR_I_FN(cossindeg) + +KFR_I_FN(sinc) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/sqrt.hpp b/include/kfr/math/impl/sqrt.hpp @@ -0,0 +1,72 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either 
version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../simd/impl/function.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(extend<4>(x).v))); } +KFR_INTRINSIC f64x1 sqrt(const f64x1& x) +{ + return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), extend<2>(x).v))); +} +KFR_INTRINSIC f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(x.v); } +KFR_INTRINSIC f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(x.v); } + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(x.v); } +KFR_INTRINSIC f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(x.v); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(x.v); } +KFR_INTRINSIC f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(x.v); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(sqrt, is_f_class<T>::value) + +#else + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sqrt(const vec<T, N>& x) +{ + return apply([](T x) { return std::sqrt(x); }, x); +} +#endif +KFR_HANDLE_SCALAR_1_T(sqrt, flt_type<T>) + +KFR_HANDLE_NOT_F_1(sqrt) +} // 
namespace intrinsics +KFR_I_FN(sqrt) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp @@ -0,0 +1,149 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/select.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse) +{ + constexpr T pi_14 = c_pi<T, 1, 4>; + + vec<T, N> y = abs(x_full); + vec<T, N> scaled = y / pi_14; + + vec<T, N> k_real = floor(scaled); + vec<IT, N> k = innercast<IT>(k_real); + + vec<T, N> x = y - k_real * pi_14; + + mask<T, N> need_offset = (k & 1) != 0; + x = select(need_offset, x - pi_14, x); + + vec<IT, N> k_mod4 = k & 3; + inverse = (k_mod4 == 1) || (k_mod4 == 2); + return x; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> tan(const vec<f32, N>& x_full) +{ + mask<f32, N> inverse; + vec<i32, N> quad; + const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; + + constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01); + constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); + constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02); + constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02); + constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03); + constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03); + + constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01); + constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02); + constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03); + constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04); + constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05); + constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, 
-4.153305894760705996e-07); + + const vec<f32, N> x2 = x * x; + const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, + tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12); + + const vec<f32, N> z = select(inverse, val / -x, val * x); + return mulsign(z, x_full); +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> tan(const vec<f64, N>& x_full) +{ + mask<f64, N> inverse; + vec<i64, N> quad; + const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; + + constexpr f64 tan_c2 = 0x1.5555555555a3cp-2; + constexpr f64 tan_c4 = 0x1.11111110c4068p-3; + constexpr f64 tan_c6 = 0x1.ba1ba1ef36a4dp-5; + constexpr f64 tan_c8 = 0x1.664f3f4af7ce2p-6; + constexpr f64 tan_c10 = 0x1.226f2682a2616p-7; + constexpr f64 tan_c12 = 0x1.d6b440e73f61dp-9; + constexpr f64 tan_c14 = 0x1.7f06cdd30bd39p-10; + constexpr f64 tan_c16 = 0x1.2a8fab895738ep-11; + constexpr f64 tan_c18 = 0x1.34ff88cfdc292p-12; + constexpr f64 tan_c20 = -0x1.b4165ea04339fp-18; + constexpr f64 tan_c22 = 0x1.5f93701d86962p-13; + constexpr f64 tan_c24 = -0x1.5a13a3cdfb8c1p-14; + constexpr f64 tan_c26 = 0x1.77c69cef3306cp-15; + + constexpr f64 cot_c2 = -0x1.5555555555555p-2; + constexpr f64 cot_c4 = -0x1.6c16c16c16dcdp-6; + constexpr f64 cot_c6 = -0x1.1566abbff68a7p-9; + constexpr f64 cot_c8 = -0x1.bbd7794ef9999p-13; + constexpr f64 cot_c10 = -0x1.66a8ea1991906p-16; + constexpr f64 cot_c12 = -0x1.228220068711cp-19; + constexpr f64 cot_c14 = -0x1.d65ed2c45e21dp-23; + constexpr f64 cot_c16 = -0x1.897ead4a2f71dp-26; + constexpr f64 cot_c18 = -0x1.b592dc8656ec9p-31; + constexpr f64 cot_c20 = -0x1.3dc07078c46d6p-29; + constexpr f64 cot_c22 = 0x1.06c9e5c370edcp-29; + constexpr f64 cot_c24 = -0x1.217f50c9dbca3p-30; + constexpr f64 cot_c26 = 0x1.163ed8171a0c8p-32; + + const vec<f64, N> x2 = x * x; + const vec<f64, N> val = + trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, cot_c8, 
tan_c8, + cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, cot_c16, tan_c16, cot_c18, tan_c18, + cot_c20, tan_c20, cot_c22, tan_c22, cot_c24, tan_c24, cot_c26, tan_c26); + + const vec<f64, N> z = select(inverse, val / -x, val * x); + return mulsign(z, x_full); +} + +KFR_HANDLE_SCALAR_1_T(tan, flt_type<T>) +KFR_HANDLE_NOT_F_1(tan) + +template <typename T> +KFR_INTRINSIC flt_type<T> tandeg(const T& x) +{ + return tan(x * c_degtorad<flt_type<T>>); +} +} // namespace intrinsics +KFR_I_FN(tan) +KFR_I_FN(tandeg) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp @@ -0,0 +1,74 @@ +/** @addtogroup interpolation + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "select.hpp" +#include "sin_cos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T, typename M> +KFR_FUNCTION T nearest(M mu, T x1, T x2) +{ + return select(mu < M(0.5), x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T linear(M mu, T x1, T x2) +{ + return mix(mu, x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T cosine(M mu, T x1, T x2) +{ + return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T cubic(M mu, T x0, T x1, T x2, T x3) +{ + const T a0 = x3 - x2 - x0 + x1; + const T a1 = x0 - x1 - a0; + const T a2 = x2 - x0; + const T a3 = x1; + return horner(mu, a0, a1, a2, a3); +} + +template <typename T, typename M> +KFR_FUNCTION T catmullrom(M mu, T x0, T x1, T x2, T x3) +{ + const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1); + const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3; + const T a2 = T(0.5) * (x2 - x0); + const T a3 = x1; + return horner(mu, a0, a1, a2, a3); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/log_exp.hpp b/include/kfr/math/log_exp.hpp @@ -0,0 +1,232 @@ +/** @addtogroup exponential + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/log_exp.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns e raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp(const T1& x) +{ + return intrinsics::exp(x); +} + +/// @brief Returns e raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp, E1> exp(E1&& x) +{ + return { fn::exp(), std::forward<E1>(x) }; +} + +/// @brief Returns 2 raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp2(const T1& x) +{ + return intrinsics::exp2(x); +} + +/// @brief Returns 2 raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp2, E1> exp2(E1&& x) +{ + return { fn::exp2(), std::forward<E1>(x) }; +} + +/// @brief Returns 10 raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp10(const T1& x) +{ + return intrinsics::exp10(x); +} + +/// @brief Returns 10 raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp10, E1> exp10(E1&& x) +{ + return { fn::exp10(), std::forward<E1>(x) }; +} + +/// @brief Returns the natural logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log(const T1& x) +{ + return intrinsics::log(x); +} + +/// @brief Returns the natural logarithm of the x. 
Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log, E1> log(E1&& x) +{ + return { fn::log(), std::forward<E1>(x) }; +} + +/// @brief Returns the binary (base-2) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log2(const T1& x) +{ + return intrinsics::log2(x); +} + +/// @brief Returns the binary (base-2) logarithm of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log2, E1> log2(E1&& x) +{ + return { fn::log2(), std::forward<E1>(x) }; +} + +/// @brief Returns the common (base-10) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log10(const T1& x) +{ + return intrinsics::log10(x); +} + +/// @brief Returns the common (base-10) logarithm of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log10, E1> log10(E1&& x) +{ + return { fn::log10(), std::forward<E1>(x) }; +} + +/// @brief Returns the rounded binary (base-2) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> logb(const T1& x) +{ + return intrinsics::logb(x); +} + +/// @brief Returns the rounded binary (base-2) logarithm of the x. Version that accepts and returns +/// expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::logb, E1> logb(E1&& x) +{ + return { fn::logb(), std::forward<E1>(x) }; +} + +/// @brief Returns the logarithm of the x with base y. 
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y) +{ + return intrinsics::logn(x, y); +} + +/// @brief Returns the logarithm of the x with base y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y) +{ + return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns log(x) * y. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y) +{ + return intrinsics::logm(x, y); +} + +/// @brief Returns log(x) * y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y) +{ + return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns exp(x * m + a). +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z) +{ + return intrinsics::exp_fmadd(x, y, z); +} + +/// @brief Returns exp(x * m + a). Accepts and returns expressions. +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_FUNCTION internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z) +{ + return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; +} + +/// @brief Returns log(x) * m + a. 
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z) +{ + return intrinsics::log_fmadd(x, y, z); +} + +/// @brief Returns log(x) * m + a. Accepts and returns expressions. +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_FUNCTION internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z) +{ + return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; +} + +/// @brief Returns the x raised to the given power y. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y) +{ + return intrinsics::pow(x, y); +} + +/// @brief Returns the x raised to the given power y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y) +{ + return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns the real nth root of the x. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> root(const T1& x, const T2& y) +{ + return intrinsics::root(x, y); +} + +/// @brief Returns the real nth root of the x. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y) +{ + return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns the cube root of the x. 
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cbrt(const T1& x) +{ + return intrinsics::cbrt(x); +} + +/// @brief Returns the cube root of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cbrt, E1> cbrt(E1&& x) +{ + return { fn::cbrt(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp @@ -0,0 +1,54 @@ +/** @addtogroup logical + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/logical.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns x[0] && x[1] && ... && x[N-1] + */ +template <typename T, size_t N> +KFR_INTRINSIC bool all(const mask<T, N>& x) +{ + return intrinsics::bittestall(x.asvec()); +} + +/** + * @brief Returns x[0] || x[1] || ... 
|| x[N-1] + */ +template <typename T, size_t N> +KFR_INTRINSIC bool any(const mask<T, N>& x) +{ + return intrinsics::bittestany(x.asvec()); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/min_max.hpp b/include/kfr/math/min_max.hpp @@ -0,0 +1,111 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the smaller of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout min(const T1& x, const T2& y) +{ + return intrinsics::min(x, y); +} + +/** + * @brief Returns the smaller of two values. Accepts and returns expressions. 
+ */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y) +{ + return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the greater of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout max(const T1& x, const T2& y) +{ + return intrinsics::max(x, y); +} + +/** + * @brief Returns the greater of two values. Accepts and returns expressions. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y) +{ + return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the smaller in magnitude of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout absmin(const T1& x, const T2& y) +{ + return intrinsics::absmin(x, y); +} + +/** + * @brief Returns the smaller in magnitude of two values. Accepts and returns expressions. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y) +{ + return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the greater in magnitude of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout absmax(const T1& x, const T2& y) +{ + return intrinsics::absmax(x, y); +} + +/** + * @brief Returns the greater in magnitude of two values. Accepts and returns expressions. 
+ */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y) +{ + return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/modzerobessel.hpp b/include/kfr/math/modzerobessel.hpp @@ -0,0 +1,47 @@ +/** @addtogroup other_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/modzerobessel.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 modzerobessel(const T1& x) +{ + return intrinsics::modzerobessel(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x) +{ + return { fn::modzerobessel(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/round.hpp b/include/kfr/math/round.hpp @@ -0,0 +1,163 @@ +/** @addtogroup round + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/round.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the largest integer value not greater than x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 floor(const T1& x) +{ + return intrinsics::floor(x); +} + +/// @brief Returns the largest integer value not greater than x. 
Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::floor, E1> floor(E1&& x) +{ + return { fn::floor(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 ceil(const T1& x) +{ + return intrinsics::ceil(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::ceil, E1> ceil(E1&& x) +{ + return { fn::ceil(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 round(const T1& x) +{ + return intrinsics::round(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::round, E1> round(E1&& x) +{ + return { fn::round(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 trunc(const T1& x) +{ + return intrinsics::trunc(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::trunc, E1> trunc(E1&& x) +{ + return { fn::trunc(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 fract(const T1& x) +{ + return intrinsics::fract(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::fract, E1> fract(E1&& x) +{ + return { fn::fract(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> ifloor(const T1& x) +{ + return intrinsics::ifloor(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::ifloor, E1> ifloor(E1&& x) +{ + return { fn::ifloor(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> 
+KFR_INTRINSIC itype<T1> iceil(const T1& x) +{ + return intrinsics::iceil(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::iceil, E1> iceil(E1&& x) +{ + return { fn::iceil(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> iround(const T1& x) +{ + return intrinsics::iround(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::iround, E1> iround(E1&& x) +{ + return { fn::iround(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> itrunc(const T1& x) +{ + return intrinsics::itrunc(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::itrunc, E1> itrunc(E1&& x) +{ + return { fn::itrunc(), std::forward<E1>(x) }; +} + +template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC T fmod(const T& x, const T& y) +{ + return x - trunc(x / y) * y; +} +KFR_FN(fmod) + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +constexpr KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return x % y; +} +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return fmod(x, y); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/saturation.hpp b/include/kfr/math/saturation.hpp @@ -0,0 +1,65 @@ +/** @addtogroup saturation + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/saturation.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Adds two arguments using saturation +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout satadd(const T1& x, const T2& y) +{ + return intrinsics::satadd(x, y); +} + +/// @brief Creates an expression that adds two arguments using saturation +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y) +{ + return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Subtracts two arguments using saturation +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout satsub(const T1& x, const T2& y) +{ + return intrinsics::satsub(x, y); +} + +/// @brief Creates an expression that subtracts two arguments using saturation +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y) +{ + return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git 
a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp @@ -0,0 +1,59 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/select.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator. + * @code + * return m ? x : y + * @endcode + */ +template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), + typename Tout = subtype<common_type<T2, T3>>> +KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) +{ + static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); + return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y)); +} + +/** + * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments + * is same as in ternary operator. 
+ */ +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y) +{ + return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/sin_cos.hpp b/include/kfr/math/sin_cos.hpp @@ -0,0 +1,318 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/sin_cos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the trigonometric sine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sin(const T1& x) +{ + return intrinsics::sin(x); +} + +/** + * @brief Returns the trigonometric sine of x. Accepts and returns expressions. 
+ */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sin, E1> sin(E1&& x) +{ + return { fn::sin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cos(const T1& x) +{ + return intrinsics::cos(x); +} + +/** + * @brief Returns the trigonometric cosine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cos, E1> cos(E1&& x) +{ + return { fn::cos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric sine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastsin(const T1& x) +{ + return intrinsics::fastsin(x); +} + +/** + * @brief Returns an approximation of the trigonometric sine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastsin, E1> fastsin(E1&& x) +{ + return { fn::fastsin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric cosine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastcos(const T1& x) +{ + return intrinsics::fastcos(x); +} + +/** + * @brief Returns an approximation of the trigonometric cosine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastcos, E1> fastcos(E1&& x) +{ + return { fn::fastcos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must + * be a vector. 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sincos(const T1& x) +{ + return intrinsics::sincos(x); +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and + * cosine of the odd elements. x must be a vector. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sincos, E1> sincos(E1&& x) +{ + return { fn::sincos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must + * be a vector. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cossin(const T1& x) +{ + return intrinsics::cossin(x); +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and + * sine of the odd elements. x must be a vector. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cossin, E1> cossin(E1&& x) +{ + return { fn::cossin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sindeg(const T1& x) +{ + return intrinsics::sindeg(x); +} + +/** + * @brief Returns the trigonometric sine of the x (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sindeg, E1> sindeg(E1&& x) +{ + return { fn::sindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the x (expressed in degrees). 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cosdeg(const T1& x) +{ + return intrinsics::cosdeg(x); +} + +/** + * @brief Returns the trigonometric cosine of the x (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x) +{ + return { fn::cosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastsindeg(const T1& x) +{ + return intrinsics::fastsindeg(x); +} + +/** + * @brief Returns an approximation of the trigonometric sine of the x + * (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x) +{ + return { fn::fastsindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastcosdeg(const T1& x) +{ + return intrinsics::fastcosdeg(x); +} + +/** + * @brief Returns an approximation of the trigonometric cosine of the x + * (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x) +{ + return { fn::fastcosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must + * be a vector and expressed in degrees. 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sincosdeg(const T1& x) +{ + return intrinsics::sincosdeg(x); +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and + * cosine of the odd elements. x must be expressed in degrees. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x) +{ + return { fn::sincosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must + * be a vector and expressed in degrees. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cossindeg(const T1& x) +{ + return intrinsics::cossindeg(x); +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and + * sine of the odd elements. x must be expressed in degrees. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x) +{ + return { fn::cossindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the sinc function of x. + * \f[ + * sinc(x) = \frac{sin(x)}{x} + * \f] + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinc(const T1& x) +{ + return intrinsics::sinc(x); +} + +/** + * @brief Returns the sinc function of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinc, E1> sinc(E1&& x) +{ + return { fn::sinc(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x). 
+ */ +template <typename T> +KFR_INTRINSIC T sin2x(const T& sinx, const T& cosx) +{ + return 2 * sinx * cosx; +} + +/** + * @brief Returns the trigonometric sine of the angle 3x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T sin3x(const T& sinx, const T& cosx) +{ + return sinx * (-1 + 4 * sqr(cosx)); +} + +/** + * @brief Returns the trigonometric cosine of the angle 2x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T cos2x(const T& sinx, const T& cosx) +{ + return sqr(cosx) - sqr(sinx); +} + +/** + * @brief Returns the trigonometric cosine of the angle 3x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T cos3x(const T& sinx, const T& cosx) +{ + return cosx * (1 - 4 * sqr(sinx)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/sqrt.hpp b/include/kfr/math/sqrt.hpp @@ -0,0 +1,53 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "impl/sqrt.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

/**
 * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_INTRINSIC flt_type<T1> sqrt(const T1& x)
{
    // Result is always a floating-point type (flt_type), even for integer arguments.
    return intrinsics::sqrt(x);
}

/**
 * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_INTRINSIC internal::expression_function<fn::sqrt, E1> sqrt(E1&& x)
{
    // Builds a lazy expression node; evaluation happens when the expression is read.
    return { fn::sqrt(), std::forward<E1>(x) };
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/math/tan.hpp b/include/kfr/math/tan.hpp
@@ -0,0 +1,59 @@
/** @addtogroup trigonometric
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "impl/tan.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

/**
 * @brief Returns the trigonometric tangent of x.
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_FUNCTION flt_type<T1> tan(const T1& x)
{
    return intrinsics::tan(x);
}

/**
 * @brief Returns the trigonometric tangent of x. Accepts and returns expressions.
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_FUNCTION internal::expression_function<fn::tan, E1> tan(E1&& x)
{
    return { fn::tan(), std::forward<E1>(x) };
}

/**
 * @brief Returns the trigonometric tangent of x. x must be expressed in degrees.
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_FUNCTION flt_type<T1> tandeg(const T1& x)
{
    return intrinsics::tandeg(x);
}

/**
 * @brief Returns the trigonometric tangent of x. x must be expressed in degrees.
 * Accepts and returns expressions.
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_FUNCTION internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
{
    return { fn::tandeg(), std::forward<E1>(x) };
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/runtime.hpp b/include/kfr/runtime.hpp
@@ -0,0 +1,26 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
+ */ +#pragma once + +#include "runtime/cpuid.hpp" +#include "runtime/cpuid_auto.hpp" diff --git a/include/kfr/runtime/cpuid.hpp b/include/kfr/runtime/cpuid.hpp @@ -0,0 +1,300 @@ +/** @addtogroup cpuid + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include "../simd/platform.hpp"
#include "../simd/types.hpp"
#include <cstring>

namespace kfr
{
#ifdef CMT_ARCH_X86

// One-bit flags for every CPU capability probed by detect_cpu() below,
// plus the vendor/model strings returned by CPUID.
struct cpu_features
{
    u32 max;   // highest supported standard CPUID leaf (leaf 0, EAX)
    u32 exmax; // highest supported extended CPUID leaf (leaf 0x80000000, EAX)
    u32 isIntel : 1;
    u32 isAMD : 1;
    u32 has3DNOW : 1;
    u32 has3DNOWEXT : 1;
    u32 hasABM : 1;
    u32 hasADX : 1;
    u32 hasAES : 1;
    u32 hasAVX : 1;
    u32 hasAVX2 : 1;
    u32 hasAVXOSSUPPORT : 1;
    u32 hasAVX512OSSUPPORT : 1;
    u32 hasAVX512CD : 1;
    u32 hasAVX512ER : 1;
    u32 hasAVX512F : 1;
    u32 hasAVX512DQ : 1;
    u32 hasAVX512PF : 1;
    u32 hasAVX512BW : 1;
    u32 hasAVX512VL : 1;
    u32 hasBMI1 : 1;
    u32 hasBMI2 : 1;
    u32 hasCLFSH : 1;
    u32 hasCMOV : 1;
    u32 hasCMPXCHG16B : 1;
    u32 hasCX8 : 1;
    u32 hasERMS : 1;
    u32 hasF16C : 1;
    u32 hasFMA : 1;
    u32 hasFSGSBASE : 1;
    u32 hasFXSR : 1;
    u32 hasHLE : 1;
    u32 hasINVPCID : 1;
    u32 hasLAHF : 1;
    u32 hasLZCNT : 1;
    u32 hasMMX : 1;
    u32 hasMMXEXT : 1;
    u32 hasMONITOR : 1;
    u32 hasMOVBE : 1;
    u32 hasMSR : 1;
    u32 hasOSXSAVE : 1;
    u32 hasPCLMULQDQ : 1;
    u32 hasPOPCNT : 1;
    u32 hasPREFETCHWT1 : 1;
    u32 hasRDRAND : 1;
    u32 hasRDSEED : 1;
    u32 hasRDTSCP : 1;
    u32 hasRTM : 1;
    u32 hasSEP : 1;
    u32 hasSHA : 1;
    u32 hasSSE : 1;
    u32 hasSSE2 : 1;
    u32 hasSSE3 : 1;
    u32 hasSSE41 : 1;
    u32 hasSSE42 : 1;
    u32 hasSSE4a : 1;
    u32 hasSSSE3 : 1;
    u32 hasSYSCALL : 1;
    u32 hasTBM : 1;
    u32 hasXOP : 1;
    u32 hasXSAVE : 1;
    u32 padding1 : 6;
    char vendor[17]; // 12-byte CPUID vendor string + NUL (zero-filled by memset)
    char model[49];  // 48-byte brand string from leaves 0x80000002..4 + NUL
    char padding2[2];
};

namespace internal_generic
{

// Raw EAX/EBX/ECX/EDX output of one CPUID invocation.
struct cpu_data
{
    u32 data[4];
};

#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG
// Executes CPUID with EAX=func, ECX=subfunc; writes the four result registers.
KFR_INTRINSIC u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx)
{
    __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc));
    return 1;
}
KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
{
    get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]);
}
// XGETBV(0): reads XCR0, the OS-enabled extended-state mask.
// XGETBV writes EDX:EAX; only EAX is needed, EDX is declared clobbered.
KFR_INTRINSIC u32 get_xcr0()
{
    u32 xcr0;
    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
    return xcr0;
}
#elif defined CMT_COMPILER_MSVC

KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
{
    __cpuidex((int*)ptr, (int)func, (int)subfunc);
}
KFR_INTRINSIC u32 get_xcr0()
{
#ifdef _XCR_XFEATURE_ENABLED_MASK
    unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (u32)Result;
#else
    // Toolchain too old to expose _xgetbv: report no OS-enabled extended state.
    return 0;
#endif
}
#endif

// Queries CPUID leaves 0, 1, 7 and 0x80000001..0x80000004, fills a cpu_features
// bitfield, and maps the result to the best supported cpu_t instruction set.
// The dummy template parameter keeps this definition header-only without
// violating the one-definition rule.
template <size_t = 0>
cpu_t detect_cpu()
{
    cpu_features c;
    memset(&c, 0, sizeof(c));
    cpu_data data0;
    cpu_data exdata0;

    u32 f_1_ECX(0);
    u32 f_1_EDX(0);
    u32 f_7_EBX(0);
    u32 f_7_ECX(0);
    u32 f_81_ECX(0);
    u32 f_81_EDX(0);

    cpuid(data0.data, 0);
    c.max = static_cast<u32>(data0.data[0]);
    cpuid(exdata0.data, 0x80000000);
    c.exmax = static_cast<u32>(exdata0.data[0]);

    // Vendor string is returned in EBX, EDX, ECX order (12 bytes).
    *ptr_cast<u32>(c.vendor)     = static_cast<u32>(data0.data[1]);
    *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]);
    *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]);

    c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0;
    c.isAMD   = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 1 : 0;

    if (c.max >= 1)
    {
        cpu_data data1;
        cpuid(data1.data, 1);
        f_1_ECX = static_cast<u32>(data1.data[2]);
        f_1_EDX = static_cast<u32>(data1.data[3]);
    }

    if (c.max >= 7)
    {
        cpu_data data7;
        cpuid(data7.data, 7); // subleaf 0 (default subfunc)
        f_7_EBX = static_cast<u32>(data7.data[1]);
        f_7_ECX = static_cast<u32>(data7.data[2]);
    }

    if (c.exmax >= 0x80000001)
    {
        cpu_data data81;
        cpuid(data81.data, 0x80000001);
        f_81_ECX = static_cast<u32>(data81.data[2]);
        f_81_EDX = static_cast<u32>(data81.data[3]);
    }

    if (c.exmax >= 0x80000004)
    {
        // Brand string: 3 leaves x 16 bytes = 48 bytes.
        cpu_data data82;
        cpu_data data83;
        cpu_data data84;
        cpuid(data82.data, 0x80000002);
        cpuid(data83.data, 0x80000003);
        cpuid(data84.data, 0x80000004);
        memcpy(c.model, data82.data, sizeof(cpu_data));
        memcpy(c.model + 16, data83.data, sizeof(cpu_data));
        memcpy(c.model + 32, data84.data, sizeof(cpu_data));
    }

    // Feature-bit positions follow the CPUID specification for each leaf.
    c.hasSSE3        = f_1_ECX >> 0 & 1;
    c.hasPCLMULQDQ   = f_1_ECX >> 1 & 1;
    c.hasMONITOR     = f_1_ECX >> 3 & 1;
    c.hasSSSE3       = f_1_ECX >> 9 & 1;
    c.hasFMA         = f_1_ECX >> 12 & 1;
    c.hasCMPXCHG16B  = f_1_ECX >> 13 & 1;
    c.hasSSE41       = f_1_ECX >> 19 & 1;
    c.hasSSE42       = f_1_ECX >> 20 & 1;
    c.hasMOVBE       = f_1_ECX >> 22 & 1;
    c.hasPOPCNT      = f_1_ECX >> 23 & 1;
    c.hasAES         = f_1_ECX >> 25 & 1;
    c.hasXSAVE       = f_1_ECX >> 26 & 1;
    c.hasOSXSAVE     = f_1_ECX >> 27 & 1;
    c.hasAVX         = f_1_ECX >> 28 & 1;
    c.hasF16C        = f_1_ECX >> 29 & 1;
    c.hasRDRAND      = f_1_ECX >> 30 & 1;
    c.hasMSR         = f_1_EDX >> 5 & 1;
    c.hasCX8         = f_1_EDX >> 8 & 1;
    c.hasSEP         = f_1_EDX >> 11 & 1;
    c.hasCMOV        = f_1_EDX >> 15 & 1;
    c.hasCLFSH       = f_1_EDX >> 19 & 1;
    c.hasMMX         = f_1_EDX >> 23 & 1;
    c.hasFXSR        = f_1_EDX >> 24 & 1;
    c.hasSSE         = f_1_EDX >> 25 & 1;
    c.hasSSE2        = f_1_EDX >> 26 & 1;
    c.hasFSGSBASE    = f_7_EBX >> 0 & 1;
    c.hasBMI1        = f_7_EBX >> 3 & 1;
    // Vendor-gated flags mirror the vendor-specific meaning of these bits.
    c.hasHLE         = c.isIntel && f_7_EBX >> 4 & 1;
    c.hasAVX2        = f_7_EBX >> 5 & 1;
    c.hasBMI2        = f_7_EBX >> 8 & 1;
    c.hasERMS        = f_7_EBX >> 9 & 1;
    c.hasINVPCID     = f_7_EBX >> 10 & 1;
    c.hasRTM         = c.isIntel && f_7_EBX >> 11 & 1;
    c.hasAVX512F     = f_7_EBX >> 16 & 1;
    c.hasAVX512DQ    = f_7_EBX >> 17 & 1;
    c.hasRDSEED      = f_7_EBX >> 18 & 1;
    c.hasADX         = f_7_EBX >> 19 & 1;
    c.hasAVX512PF    = f_7_EBX >> 26 & 1;
    c.hasAVX512ER    = f_7_EBX >> 27 & 1;
    c.hasAVX512CD    = f_7_EBX >> 28 & 1;
    c.hasSHA         = f_7_EBX >> 29 & 1;
    c.hasAVX512BW    = f_7_EBX >> 30 & 1;
    c.hasAVX512VL    = f_7_EBX >> 31 & 1;
    c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1;
    c.hasLAHF        = f_81_ECX >> 0 & 1;
    c.hasLZCNT       = c.isIntel && f_81_ECX >> 5 & 1;
    c.hasABM         = c.isAMD && f_81_ECX >> 5 & 1;
    c.hasSSE4a       = c.isAMD && f_81_ECX >> 6 & 1;
    c.hasXOP         = c.isAMD && f_81_ECX >> 11 & 1;
    c.hasTBM         = c.isAMD && f_81_ECX >> 21 & 1;
    c.hasSYSCALL     = c.isIntel && f_81_EDX >> 11 & 1;
    c.hasMMXEXT      = c.isAMD && f_81_EDX >> 22 & 1;
    c.hasRDTSCP      = c.isIntel && f_81_EDX >> 27 & 1;
    c.has3DNOWEXT    = c.isAMD && f_81_EDX >> 30 & 1;
    c.has3DNOW       = c.isAMD && f_81_EDX >> 31 & 1;

    // AVX needs the OS to enable XMM+YMM state in XCR0 (bits 1-2 -> mask 0x06);
    // AVX-512 additionally needs opmask/ZMM state (bits 5-7 -> mask 0xE0).
    c.hasAVXOSSUPPORT    = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06;
    c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0;

    // Highest-to-lowest decision ladder; the first satisfied tier wins.
    if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ &&
        c.hasAVX512OSSUPPORT)
        return cpu_t::avx512;
    if (c.hasAVX2 && c.hasAVXOSSUPPORT)
        return cpu_t::avx2;
    if (c.hasAVX && c.hasAVXOSSUPPORT)
        return cpu_t::avx1;
    if (c.hasSSE41)
        return cpu_t::sse41;
    if (c.hasSSSE3)
        return cpu_t::ssse3;
    if (c.hasSSE3)
        return cpu_t::sse3;
    if (c.hasSSE2)
        return cpu_t::sse2;
    return cpu_t::lowest;
}
} // namespace internal_generic
#else

// Non-x86: no runtime dispatch, report the compile-time target.
// NOTE(review): this fallback lives directly in kfr::, while the x86 version is
// in kfr::internal_generic:: - callers appear to rely on unqualified lookup; confirm.
template <size_t = 0>
cpu_t detect_cpu()
{
    return cpu_t::native;
}

#endif
} // namespace kfr
diff --git a/include/kfr/runtime/cpuid_auto.hpp b/include/kfr/runtime/cpuid_auto.hpp
@@ -0,0 +1,62 @@
/** @addtogroup cpuid
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General
Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "cpuid.hpp"

namespace kfr
{

namespace internal_generic
{

// Function-local static holding the detected instruction set;
// defaults to the compile-time target until init_cpu_v() runs.
KFR_INTRINSIC cpu_t& cpu_v()
{
    static cpu_t v1 = cpu_t::native;
    return v1;
}

// Runs CPUID-based detection once and caches the result in cpu_v().
KFR_INTRINSIC char init_cpu_v()
{
    cpu_v() = detect_cpu<0>();
    return 0;
}

KFR_INTRINSIC char init_dummyvar()
{
    // The static ensures detection runs at most once per program.
    static char dummy = init_cpu_v();
    return dummy;
}

// Including this header forces CPU detection during static initialization.
static char dummyvar = init_dummyvar();
} // namespace internal_generic

/**
 * @brief Returns cpu instruction set detected at runtime.
 */
KFR_FUNCTION cpu_t get_cpu() { return internal_generic::cpu_v(); }

} // namespace kfr
diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp
@@ -0,0 +1,36 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "simd/comparison.hpp" +#include "simd/complex.hpp" +#include "simd/constants.hpp" +#include "simd/digitreverse.hpp" +#include "simd/horizontal.hpp" +#include "simd/mask.hpp" +#include "simd/operators.hpp" +#include "simd/platform.hpp" +#include "simd/read_write.hpp" +#include "simd/shuffle.hpp" +#include "simd/types.hpp" +#include "simd/vec.hpp" diff --git a/include/kfr/simd/comparison.hpp b/include/kfr/simd/comparison.hpp @@ -0,0 +1,152 @@ +/** @addtogroup logical + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "constants.hpp" +#include "impl/function.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y) +{ + return x == y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y) +{ + return x != y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y) +{ + return x < y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y) +{ + return x > y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y) +{ + return x <= y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y) +{ + return x >= y; +} +KFR_FN(equal) +KFR_FN(notequal) +KFR_FN(less) +KFR_FN(greater) +KFR_FN(lessorequal) +KFR_FN(greaterorequal) + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2) +{ + return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2) +{ + return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2) +{ + return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::greater, E1, E2> 
operator>(E1&& e1, E2&& e2) +{ + return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2) +{ + return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2) +{ + return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isnan(const vec<T, N>& x) +{ + return x != x; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isinf(const vec<T, N>& x) +{ + return x == avoid_odr_use(constants<T>::infinity) || x == -constants<T>::infinity; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isfinite(const vec<T, N>& x) +{ + return !isnan(x) && !isinf(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isnegative(const vec<T, N>& x) +{ + return (x & constants<T>::highbitmask()) != 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> ispositive(const vec<T, N>& x) +{ + return !isnegative(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> iszero(const vec<T, N>& x) +{ + return x == T(); +} + +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max) +{ + return x >= min && x <= max; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/complex.hpp b/include/kfr/simd/complex.hpp @@ -0,0 +1,468 @@ +/** @addtogroup complex + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the 
GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once
#include "constants.hpp"
#include "impl/function.hpp"
#include "operators.hpp"

#ifdef KFR_STD_COMPLEX
#include <complex>
#endif

CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4814))

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

#ifdef KFR_STD_COMPLEX

template <typename T>
using complex = std::complex<T>;

#else
#ifndef KFR_CUSTOM_COMPLEX

/**
 * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for
 * std::complex.
 */
template <typename T>
struct complex
{
    static_assert(is_simd_type<T>::value, "Incorrect type for complex");
    constexpr static bool is_pod = true;
    constexpr complex() CMT_NOEXCEPT = default;
    KFR_MEM_INTRINSIC constexpr complex(T re) CMT_NOEXCEPT : re(re), im(0) {}
    KFR_MEM_INTRINSIC constexpr complex(T re, T im) CMT_NOEXCEPT : re(re), im(im) {}
    constexpr complex(const complex&) CMT_NOEXCEPT = default;
    constexpr complex(complex&&) CMT_NOEXCEPT = default;
    // Converting constructors: element-wise static_cast from complex<U>.
    template <typename U>
    KFR_MEM_INTRINSIC constexpr complex(const complex<U>& other) CMT_NOEXCEPT : re(static_cast<T>(other.re)),
                                                                                im(static_cast<T>(other.im))
    {
    }
    template <typename U>
    KFR_MEM_INTRINSIC constexpr complex(complex<U>&& other) CMT_NOEXCEPT : re(std::move(other.re)),
                                                                           im(std::move(other.im))
    {
    }
#ifdef CMT_COMPILER_GNU
    // constexpr defaulted assignment only on GNU-compatible compilers;
    // presumably MSVC rejects it here (cf. warning 4814 disabled above) - confirm.
    constexpr complex& operator=(const complex&) CMT_NOEXCEPT = default;
    constexpr complex& operator=(complex&&) CMT_NOEXCEPT = default;
#else
    complex& operator=(const complex&) = default;
    complex& operator=(complex&&) = default;
#endif
    KFR_MEM_INTRINSIC constexpr const T& real() const CMT_NOEXCEPT { return re; }
    KFR_MEM_INTRINSIC constexpr const T& imag() const CMT_NOEXCEPT { return im; }
    KFR_MEM_INTRINSIC constexpr void real(T value) CMT_NOEXCEPT { re = value; }
    KFR_MEM_INTRINSIC constexpr void imag(T value) CMT_NOEXCEPT { im = value; }
    // Storage layout: real part first, imaginary part second.
    T re;
    T im;

    // Scalar complex arithmetic is routed through 1-wide vectors so it shares
    // the SIMD implementation of the operators.
    KFR_MEM_INTRINSIC friend complex operator+(const complex& x, const complex& y)
    {
        return (make_vector(x) + make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator-(const complex& x, const complex& y)
    {
        return (make_vector(x) - make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator*(const complex& x, const complex& y)
    {
        return (make_vector(x) * make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator/(const complex& x, const complex& y)
    {
        return (make_vector(x) / make_vector(y))[0];
    }

    // Mixed complex/scalar operators: both operands are promoted to the
    // common type C before the operation.
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator+(const complex& x, const U& y)
    {
        return static_cast<C>(x) + static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator-(const complex& x, const U& y)
    {
        return static_cast<C>(x) - static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator*(const complex& x, const U& y)
    {
        return static_cast<C>(x) * static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator/(const complex& x, const U& y)
    {
        return static_cast<C>(x) / static_cast<C>(y);
    }

    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator+(const U& x, const complex& y)
    {
        return static_cast<C>(x) + static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator-(const U& x, const complex& y)
    {
        return static_cast<C>(x) - static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator*(const U& x, const complex& y)
    {
        return static_cast<C>(x) * static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator/(const U& x, const complex& y)
    {
        return static_cast<C>(x) / static_cast<C>(y);
    }
    KFR_MEM_INTRINSIC friend complex operator-(const complex& x) { return (-make_vector(x))[0]; }
    KFR_MEM_INTRINSIC friend complex operator+(const complex& x) { return x; }
};
#endif
#endif
} // namespace CMT_ARCH_NAME
} //
namespace kfr +namespace cometa +{ +template <typename T> +struct compound_type_traits<kfr::complex<T>> +{ + constexpr static size_t width = 2; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::complex<U>; + template <typename U> + using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>; + + static constexpr subtype at(const kfr::complex<T>& value, size_t index) + { + return index == 0 ? value.real() : value.imag(); + } +}; +} // namespace cometa +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Alias for complex<f32> +using c32 = complex<f32>; + +/// @brief Alias for complex<f64> +using c64 = complex<f64>; + +/// @brief Alias for complex<fbase> +using cbase = complex<fbase>; + +namespace intrinsics +{ +template <typename T> +constexpr inline complex<T> vcomplex(const vec<T, 2>& v) +{ + return complex<T>(v.front(), v.back()); +} +template <typename T> +constexpr inline vec<T, 2> vcomplex(const complex<T>& v) +{ + return vec<T, 2>(v.real(), v.imag()); +} +template <typename T> +constexpr inline simd<T, 2> vvcomplex(const complex<T>& v) +{ + return intrinsics::simd_make(ctype<T>, v.real(), v.imag()); +} +} // namespace intrinsics + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevector(const vec<complex<T>, N>& x, + csizes_t<indices...>) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, scale<2, indices...>(), overload_auto); +} +template <typename T, size_t N, size_t... 
indices> +KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevectors(const vec<complex<T>, N>& x, + const vec<T, N>& y, + csizes_t<indices...>) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, scale<2, indices...>(), + overload_auto); +} +namespace internal +{ +template <typename T> +struct compoundcast<complex<T>> +{ + static vec<T, 2> to_flat(const complex<T>& x) { return { x.real(), x.imag() }; } + static complex<T> from_flat(const vec<T, 2>& x) { return { x.front(), x.back() }; } +}; + +template <typename T, size_t N> +struct compoundcast<vec<complex<T>, N>> +{ + static vec<T, N * 2> to_flat(const vec<complex<T>, N>& x) { return x.flatten(); } + static vec<complex<T>, N / 2> from_flat(const vec<T, N>& x) + { + return vec<complex<T>, N / 2>::from_flatten(x); + } +}; +} // namespace internal + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) +{ + return vec<complex<T>, N / 2>::from_flatten(x); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) +{ + return x.flatten(); +} + +/// @brief Returns vector of complex values with real part duplicated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) +{ + return ccomp(dupeven(cdecom(x))); +} +KFR_FN(cdupreal) + +/// @brief Returns vector of complex values with imaginary part duplicated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) +{ + return ccomp(dupodd(cdecom(x))); +} +KFR_FN(cdupimag) + +/// @brief Returns vector of complex values with real and imaginary parts swapped +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) +{ + return ccomp(swap<2>(cdecom(x))); +} +KFR_FN(cswapreim) + +/// @brief Returns vector of complex values with real part negated +template <typename T, size_t N> 
+KFR_INTRINSIC vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) +{ + return x ^ complex<T>(-T(), T()); +} +KFR_FN(cnegreal) + +/// @brief Returns vector of complex values with imaginary part negated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) +{ + return x ^ complex<T>(T(), -T()); +} +KFR_FN(cnegimag) + +namespace internal +{ +template <typename T> +struct is_complex_impl : std::false_type +{ +}; +template <typename T> +struct is_complex_impl<complex<T>> : std::true_type +{ +}; + +// vector<complex> to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<complex<From>, N>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<complex<From>, N>& value) + { + return vec<To, N * 2>(value.flatten()).v; + } +}; + +// vector to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<From, N>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<From, N>& value) + { + const vec<To, N> casted = static_cast<vec<To, N>>(value); + return interleave(casted, zerovector(casted)).v; + } +}; + +} // namespace internal + +/// @brief Returns the real part of the complex value +template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> +constexpr KFR_INTRINSIC T real(const T& value) +{ + return value; +} + +/// @brief Returns the real part of the complex value +template <typename T> +constexpr KFR_INTRINSIC T real(const complex<T>& value) +{ + return value.real(); +} + +/// @brief Returns the real part of the complex value +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> real(const vec<complex<T>, N>& value) +{ + return even(cdecom(value)); +} + +template <typename T> +using realtype = 
decltype(kfr::real(std::declval<T>())); +template <typename T> +using realftype = ftype<decltype(kfr::real(std::declval<T>()))>; + +KFR_FN(real) + +/// @brief Returns the real part of the complex value +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::real, E1> real(E1&& x) +{ + return { {}, std::forward<E1>(x) }; +} + +/// @brief Returns the imaginary part of the complex value +template <typename T> +constexpr KFR_INTRINSIC T imag(const complex<T>& value) +{ + return value.imag(); +} + +/// @brief Returns the imaginary part of the complex value +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> imag(const vec<complex<T>, N>& value) +{ + return odd(cdecom(value)); +} +KFR_FN(imag) + +/// @brief Returns the imaginary part of the complex value +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::imag, E1> imag(E1&& x) +{ + return { {}, std::forward<E1>(x) }; +} + +/// @brief Constructs complex value from real and imaginary parts +template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>> +constexpr KFR_INTRINSIC vec<complex<T>, N> make_complex(const vec<T1, N>& real, + const vec<T2, N>& imag = T2(0)) +{ + return ccomp(interleave(innercast<T>(real), innercast<T>(imag))); +} + +/// @brief Constructs complex value from real and imaginary parts +template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>> +constexpr KFR_INTRINSIC complex<T> make_complex(T1 real, T2 imag = T2(0)) +{ + return complex<T>(innercast<T>(real), innercast<T>(imag)); +} + +namespace intrinsics +{ +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) +{ + return cnegimag(x); +} +} // namespace intrinsics + +/// @brief Returns the complex conjugate of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 
cconj(const T1& x) +{ + return intrinsics::cconj(x); +} + +template <size_t N> +struct vec_of_complex +{ + template <typename T> + using type = vec<complex<T>, N>; +}; +} // namespace CMT_ARCH_NAME + +template <typename T1, typename T2> +struct common_type_impl<kfr::complex<T1>, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2> +struct common_type_impl<kfr::complex<T1>, T2> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2> +struct common_type_impl<T1, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::vec<kfr::complex<T2>, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::complex<T1>, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, kfr::complex<T2>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/constants.hpp b/include/kfr/simd/constants.hpp @@ -0,0 +1,160 @@ +/** @addtogroup constants + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "types.hpp" +#include <limits> + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4309)) +CMT_PRAGMA_MSVC(warning(disable : 4146)) + +namespace kfr +{ + +#if CMT_COMPILER_GNU +constexpr double infinity = __builtin_inf(); +constexpr double qnan = __builtin_nan(""); +#else +constexpr double infinity = HUGE_VAL; +constexpr double qnan = NAN; +#endif +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow") + +template <typename T> +struct scalar_constants +{ + constexpr static T pi_s(int m, int d = 1) { return pi * m / d; } + constexpr static T recip_pi_s(int m, int d = 1) { return recip_pi * m / d; } + + constexpr static T pi = static_cast<T>(3.1415926535897932384626433832795); + constexpr static T sqr_pi = static_cast<T>(9.8696044010893586188344909998762); + constexpr static T recip_pi = static_cast<T>(0.31830988618379067153776752674503); + constexpr static T degtorad = static_cast<T>(pi / 180); + constexpr static T radtodeg = static_cast<T>(pi * 180); + constexpr static T e = static_cast<T>(2.718281828459045235360287471352662); + constexpr static T recip_log_2 = static_cast<T>(1.442695040888963407359924681001892137426645954); + 
constexpr static T recip_log_10 = static_cast<T>(0.43429448190325182765112891891661); + constexpr static T log_2 = static_cast<T>(0.69314718055994530941723212145818); + constexpr static T log_10 = static_cast<T>(2.3025850929940456840179914546844); + constexpr static T sqrt_2 = static_cast<T>(1.4142135623730950488016887242097); + + constexpr static T fold_constant_div = choose_const<T>( + CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01)); + + constexpr static T fold_constant_hi = choose_const<T>( + CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01)); + constexpr static T fold_constant_rem1 = + choose_const<T>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f), + CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08)); + constexpr static T fold_constant_rem2 = + choose_const<T>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f), + CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15)); + + constexpr static T epsilon = std::numeric_limits<T>::epsilon(); + constexpr static T infinity = std::numeric_limits<T>::infinity(); + constexpr static T neginfinity = -std::numeric_limits<T>::infinity(); + constexpr static T qnan = std::numeric_limits<T>::quiet_NaN(); +}; + +template <typename T> +struct constants : public scalar_constants<subtype<T>> +{ +public: + using Tsub = subtype<T>; +}; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +/// π (pi) +/// c_pi<f64, 4> = 4pi +/// c_pi<f64, 3, 4> = 3/4pi +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d); + +/// π² (pi²) +/// c_sqr_pi<f64, 4> = 4pi² +/// c_sqr_pi<f64, 3, 4> = 3/4pi² +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d); + +/// 1/π (1/pi) +/// c_recip_pi<f64> 1/pi +/// c_recip_pi<f64, 4> 4/pi +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_recip_pi = 
subtype<T>(0.31830988618379067153776752674503 * m / d); + +/// degree to radian conversion factor +template <typename T> +constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>; + +/// radian to degree conversion factor +template <typename T> +constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>; + +/// e, Euler's number +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d); + +template <typename T> +constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 23 : 52; + +template <typename T> +constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1; + +template <typename T> +constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon()); + +/// infinity +template <typename T> +constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity(); + +/// -infinity +template <typename T> +constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity(); + +/// Quiet NaN +template <typename T> +constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN(); + +template <typename T> +constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954); + +template <typename T> +constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661); + +template <typename T> +constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818); + +template <typename T> +constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844); + +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d); +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp @@ -0,0 +1,110 @@ +/** @addtogroup shuffle + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free 
software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "shuffle.hpp" +#include "types.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace internal +{ + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") + +constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } + +template <u32 m, u32 shift, u32... 
values> +constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) +{ + return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); +} + +template <size_t bits> +constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) +{ + return bit_permute_step_impl( + x, + cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> + (32 - bits); +} + +template <size_t bits> +constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) +{ + return bit_permute_step_impl( + x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> + (32 - bits); +} + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <size_t radix, size_t bits> +struct shuffle_index_digitreverse +{ + constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT + { + return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); + } +}; +} // namespace internal + +template <size_t radix, size_t group = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> digitreverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>( + csizeseq<N / group>.map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>()))); +} + +template <size_t groupsize = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> bitreverse(const vec<T, N>& x) +{ + return digitreverse<2, groupsize>(x); +} + +template <size_t groupsize = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x) +{ + return digitreverse<4, groupsize>(x); +} + +template <size_t bits> +constexpr inline u32 bitreverse(u32 x) +{ + return internal::digitreverse_impl<bits>(x, csize_t<2>()); +} + +template <size_t bits> +constexpr inline u32 digitreverse4(u32 x) +{ + return internal::digitreverse_impl<bits>(x, csize_t<4>()); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/horizontal.hpp b/include/kfr/simd/horizontal.hpp @@ -0,0 +1,138 @@ +/** @addtogroup horizontal + * @{ + */ 
+/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, typename ReduceFn> +KFR_INTRINSIC T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) +{ + return T(value.front()); +} + +template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> +KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +{ + return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce)); +} +template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> +KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +{ + const T initial = reduce(initialvalue<T>()); + return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); +} +} // namespace intrinsics + +template <typename T, size_t N, typename ReduceFn> +KFR_INTRINSIC T horizontal(const vec<T, N>& value, ReduceFn&& reduce) +{ + return 
intrinsics::horizontal_impl(value, std::forward<ReduceFn>(reduce)); +} + +/// @brief Sum all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hadd(const vec<T, N>& value) +{ + return horizontal(value, fn::add()); +} +KFR_FN(hadd) + +/// @brief Sum all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hsum(const vec<T, N>& value) +{ + return horizontal(value, fn::add()); +} +KFR_FN(hsum) + +/// @brief Multiply all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hmul(const vec<T, N>& value) +{ + return horizontal(value, fn::mul()); +} +KFR_FN(hmul) + +/// @brief Multiply all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hproduct(const vec<T, N>& value) +{ + return horizontal(value, fn::mul()); +} +KFR_FN(hproduct) + +template <typename T, size_t N> +KFR_INTRINSIC T hbitwiseand(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwiseand()); +} +KFR_FN(hbitwiseand) +template <typename T, size_t N> +KFR_INTRINSIC T hbitwiseor(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwiseor()); +} +KFR_FN(hbitwiseor) +template <typename T, size_t N> +KFR_INTRINSIC T hbitwisexor(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwisexor()); +} +KFR_FN(hbitwisexor) + +/// @brief Calculate the Dot-Product of two vectors +template <typename T, size_t N> +KFR_INTRINSIC T hdot(const vec<T, N>& x, const vec<T, N>& y) +{ + return hadd(x * y); +} +KFR_FN(hdot) + +/// @brief Calculate the Arithmetic mean of all elements in the vector +template <typename T, size_t N> +KFR_INTRINSIC T havg(const vec<T, N>& value) +{ + return hadd(value) / N; +} +KFR_FN(havg) + +/// @brief Calculate the RMS of all elements in the vector +template <typename T, size_t N> +KFR_INTRINSIC T hrms(const vec<T, N>& value) +{ + return builtin_sqrt(hadd(value * value) / N); +} +KFR_FN(hrms) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/backend.hpp 
b/include/kfr/simd/impl/backend.hpp @@ -0,0 +1,79 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "simd.hpp" +#ifdef CMT_CLANG_EXT +#include "backend_clang.hpp" +#else +#include "backend_generic.hpp" +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#ifdef KFR_AUTOTESTS +template <typename T> +struct check_sizes +{ + static_assert(sizeof(simd<T, 1>) == sizeof(T), ""); + static_assert(sizeof(simd<T, 2>) == sizeof(T) * 2, ""); + static_assert(sizeof(simd<T, 3>) == sizeof(T) * 4, ""); + static_assert(sizeof(simd<T, 4>) == sizeof(T) * 4, ""); + static_assert(sizeof(simd<T, 5>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 6>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 7>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 8>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 16>) == sizeof(T) * 16, ""); + static_assert(sizeof(simd<T, 32>) == sizeof(T) * 32, ""); + static_assert(sizeof(simd<T, 64>) == sizeof(T) * 64, ""); + static_assert(sizeof(simd<T, 128>) == sizeof(T) * 128, ""); + 
static_assert(sizeof(simd<T, 256>) == sizeof(T) * 256, ""); + static_assert(sizeof(simd<T, 512>) == sizeof(T) * 512, ""); + static_assert(sizeof(simd<T, 513>) == sizeof(T) * 1024, ""); + static_assert(sizeof(simd<T, 1023>) == sizeof(T) * 1024, ""); + static_assert(sizeof(simd<T, 1024>) == sizeof(T) * 1024, ""); +}; + +template struct check_sizes<float>; +template struct check_sizes<double>; +template struct check_sizes<uint8_t>; +template struct check_sizes<uint16_t>; +template struct check_sizes<uint32_t>; +template struct check_sizes<uint64_t>; +template struct check_sizes<int8_t>; +template struct check_sizes<int16_t>; +template struct check_sizes<int32_t>; +template struct check_sizes<int64_t>; + +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME + +using CMT_ARCH_NAME::intrinsics::simd; +} // namespace kfr diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp @@ -0,0 +1,228 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "simd.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename TT, size_t NN> +using simd = TT __attribute__((ext_vector_type(NN))); + +template <typename T, size_t N1> +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x); + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... z); + +template <typename Tout> +KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete; + +template <typename Tout, typename Arg> +KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) +{ + return (simd<Tout, 1>){ static_cast<Tout>(arg) }; +} + +template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) +{ + return (simd<Tout, N>){ static_cast<Tout>(args)... 
}; +} + +/// @brief Returns vector with undefined value +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_undefined() +{ + simd<Tout, N> x; + return x; +} + +/// @brief Returns vector with all zeros +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_zeros() +{ + return Tout(); +} + +/// @brief Returns vector with all ones +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_allones() +{ + return special_constants<Tout>::allones(); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout))> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) +{ + return (simd<Tout, Nout>)x; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) +{ + return x; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) +{ + return value[index]; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) +{ + value[index] = x; + return value; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) +{ + return value; +} + +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) +{ + return __builtin_shufflevector(x, x, (indices > N ? -1 : static_cast<int>(indices))...); +} + +template <typename T, size_t N, size_t N2 = N, size_t... 
indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) +{ + static_assert(N == N2, ""); + return __builtin_shufflevector(x, y, (indices > 2 * N ? -1 : static_cast<int>(indices))...); +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) +{ + constexpr size_t Nmax = (N1 > N2 ? N1 : N2); + return simd_shuffle( + simd2_t<T, Nmax, Nmax>{}, simd_shuffle(simd_t<T, N1>{}, x, csizeseq<Nmax>, overload_auto), + simd_shuffle(simd_t<T, N2>{}, y, csizeseq<Nmax>, overload_auto), + csizes<(indices < N1 ? indices : indices < N1 + N2 ? indices + (Nmax - N1) : index_undefined)...>, + overload_auto); +} + +template <typename T, size_t N1> +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x) +{ + return x; +} + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) +{ + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) +{ + return __builtin_convertvector(x, simd<Tout, N>); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) +{ + return x; +} + +template <typename T, size_t N, bool A> +using simd_storage = struct_with_alignment<simd<T, N>, A>; + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) +{ + return ptr_cast<simd_storage<T, N, A>>(src)->value; +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); + constexpr auto concat_indices = cvalseq_t<size_t, N>(); + return simd_shuffle( + simd2_t<T, first, first>{}, simd_read<first, A>(src), + simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), + concat_indices, overload_auto); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) +{ + ptr_cast<simd_storage<T, N, A>>(dest)->value = value; +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + 
simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto)); + simd_write<false, rest>(dest + first, + simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto)); +} + +template <typename T, size_t N> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) +{ + return value[index]; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, size_t index, T x) +{ + value[index] = x; + return value; +} +} // namespace intrinsics +} // namespace CMT_ARCH_NAME + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -0,0 +1,1080 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "simd.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") + +namespace kfr +{ + +#if KFR_SHOW_NOT_OPTIMIZED +CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT; +#else +#define not_optimized(...) \ + do \ + { \ + } while (0) +#endif + +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +using simd = typename simd_type<T, N>::type; + +template <typename T, size_t N, typename U> +union simd_small_array { + static_assert(sizeof(T) * N == sizeof(U), ""); + T arr[N]; + U whole; + + KFR_INTRINSIC static constexpr simd_small_array from(U whole) + { + union { + const U w; + simd_small_array r; + } u{ whole }; + return u.r; + } +}; + +#define KFR_SIMD_TYPE(T, N, ...) \ + template <> \ + struct simd_type<T, N> \ + { \ + using type = __VA_ARGS__; \ + }; + +#define KFR_SIMD_SMALL_TYPE(T, N, U) \ + template <> \ + struct simd_type<T, N> \ + { \ + using type = simd_small_array<T, N, U>; \ + }; + +template <typename T> +struct simd_type<T, 1> +{ + using type = T; +}; + +template <typename T, size_t N> +struct simd_type +{ + using type = simd_halves<T, N>; +}; + +KFR_SIMD_SMALL_TYPE(u8, 2, u16) +KFR_SIMD_SMALL_TYPE(i8, 2, u16) + +KFR_SIMD_SMALL_TYPE(u8, 4, u32) +KFR_SIMD_SMALL_TYPE(u16, 2, u32) +KFR_SIMD_SMALL_TYPE(i8, 4, u32) +KFR_SIMD_SMALL_TYPE(i16, 2, u32) + +KFR_SIMD_SMALL_TYPE(u8, 8, u64) +KFR_SIMD_SMALL_TYPE(u16, 4, u64) +KFR_SIMD_SMALL_TYPE(u32, 2, u64) +KFR_SIMD_SMALL_TYPE(i8, 8, u64) +KFR_SIMD_SMALL_TYPE(i16, 4, u64) +KFR_SIMD_SMALL_TYPE(i32, 2, u64) + +KFR_SIMD_SMALL_TYPE(f32, 2, f64) + +#ifdef CMT_ARCH_SSE +KFR_SIMD_TYPE(f32, 4, __m128) +KFR_SIMD_TYPE(f64, 2, __m128d) +#endif // CMT_ARCH_SSE + +#ifdef CMT_ARCH_SSE2 +KFR_SIMD_TYPE(u8, 16, 
__m128i) +KFR_SIMD_TYPE(u16, 8, __m128i) +KFR_SIMD_TYPE(u32, 4, __m128i) +KFR_SIMD_TYPE(u64, 2, __m128i) +KFR_SIMD_TYPE(i8, 16, __m128i) +KFR_SIMD_TYPE(i16, 8, __m128i) +KFR_SIMD_TYPE(i32, 4, __m128i) +KFR_SIMD_TYPE(i64, 2, __m128i) +#endif // CMT_ARCH_SSE2 + +#ifdef CMT_ARCH_AVX +KFR_SIMD_TYPE(float, 8, __m256) +KFR_SIMD_TYPE(double, 4, __m256d) +KFR_SIMD_TYPE(u8, 32, __m256i) +KFR_SIMD_TYPE(u16, 16, __m256i) +KFR_SIMD_TYPE(u32, 8, __m256i) +KFR_SIMD_TYPE(u64, 4, __m256i) +KFR_SIMD_TYPE(i8, 32, __m256i) +KFR_SIMD_TYPE(i16, 16, __m256i) +KFR_SIMD_TYPE(i32, 8, __m256i) +KFR_SIMD_TYPE(i64, 4, __m256i) +#endif // CMT_ARCH_AVX + +#ifdef CMT_ARCH_AVX512 +KFR_SIMD_TYPE(float, 16, __m512) +KFR_SIMD_TYPE(double, 8, __m512d) +KFR_SIMD_TYPE(u8, 64, __m512i) +KFR_SIMD_TYPE(u16, 32, __m512i) +KFR_SIMD_TYPE(u32, 16, __m512i) +KFR_SIMD_TYPE(u64, 8, __m512i) +KFR_SIMD_TYPE(i8, 64, __m512i) +KFR_SIMD_TYPE(i16, 32, __m512i) +KFR_SIMD_TYPE(i32, 16, __m512i) +KFR_SIMD_TYPE(i64, 8, __m512i) +#endif // CMT_ARCH_AVX512 + +#ifdef CMT_ARCH_NEON +KFR_SIMD_TYPE(u8, 16, uint8x16_t); +KFR_SIMD_TYPE(u16, 8, uint16x8_t); +KFR_SIMD_TYPE(u32, 4, uint32x4_t); +KFR_SIMD_TYPE(u64, 2, uint64x2_t); +KFR_SIMD_TYPE(i8, 16, int8x16_t); +KFR_SIMD_TYPE(i16, 8, int16x8_t); +KFR_SIMD_TYPE(i32, 4, int32x4_t); +KFR_SIMD_TYPE(i64, 2, int64x2_t); +KFR_SIMD_TYPE(f32, 4, float32x4_t); +#ifdef CMT_ARCH_NEON64 +KFR_SIMD_TYPE(f64, 2, float64x2_t); +#endif // CMT_ARCH_NEON64 +#endif // CMT_ARCH_NEON + +#if defined CMT_COMPILER_MSVC +#define KFR_i8sse_INDEX(x, i) x.m128i_i8[i] +#define KFR_i16sse_INDEX(x, i) x.m128i_i16[i] +#define KFR_i32sse_INDEX(x, i) x.m128i_i32[i] +#define KFR_i64sse_INDEX(x, i) x.m128i_i64[i] +#define KFR_u8sse_INDEX(x, i) x.m128i_u8[i] +#define KFR_u16sse_INDEX(x, i) x.m128i_u16[i] +#define KFR_u32sse_INDEX(x, i) x.m128i_u32[i] +#define KFR_u64sse_INDEX(x, i) x.m128i_u64[i] +#define KFR_f32sse_INDEX(x, i) x.m128_f32[i] +#define KFR_f64sse_INDEX(x, i) x.m128d_f64[i] +#else +#define 
KFR_i8sse_INDEX(x, i) bitcast_anything<simd_array<i8, 16>>(x).val[i] +#define KFR_i16sse_INDEX(x, i) bitcast_anything<simd_array<i16, 8>>(x).val[i] +#define KFR_i32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_i64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2))) +#define KFR_u8sse_INDEX(x, i) bitcast_anything<simd_array<u8, 16>>(x).val[i] +#define KFR_u16sse_INDEX(x, i) bitcast_anything<simd_array<u16, 8>>(x).val[i] +#define KFR_u32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_u64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2))) +#define KFR_f32sse_INDEX(x, i) _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_f64sse_INDEX(x, i) _mm_cvtsd_f64(_mm_shuffle_pd(x, x, _MM_SHUFFLE2(1, i))) +#endif + +// specializations + +#ifdef KFR_NATIVE_INTRINSICS + +#define KFR_GEN_ty(n, ty) ty(n) +#define KFR_GEN_arg_def(n, ty) ty arg##n +#define KFR_GEN_arg(n, ty) arg##n + +#define KFR_INTRIN_MAKE(n, ty, intrin) \ + KFR_INTRINSIC simd<ty, n> simd_make(ctype_t<ty>, CMT_GEN_LIST(n, KFR_GEN_arg_def, ty)) CMT_NOEXCEPT \ + { \ + return intrin(CMT_GEN_LIST(n, KFR_GEN_arg, ty)); \ + } + +#ifdef CMT_ARCH_SSE2 +inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); } +KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x) +KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x) +KFR_INTRIN_MAKE(2, f64, _mm_setr_pd) +KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32) +KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32) +KFR_INTRIN_MAKE(4, f32, _mm_setr_ps) +KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16) +KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16) +KFR_INTRIN_MAKE(16, i8, _mm_setr_epi8) +KFR_INTRIN_MAKE(16, u8, _mm_setr_epi8) + +#define KFR_INTRIN_BITCAST(Tout, Tin, N, ...) 
\ + KFR_INTRINSIC simd<Tout, N> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } +KFR_INTRIN_BITCAST(f32, i32, 4, _mm_castsi128_ps(x)) +KFR_INTRIN_BITCAST(i32, f32, 4, _mm_castps_si128(x)) +KFR_INTRIN_BITCAST(f64, i64, 2, _mm_castsi128_pd(x)) +KFR_INTRIN_BITCAST(i64, f64, 2, _mm_castpd_si128(x)) + +#define KFR_INTRIN_BROADCAST(T, N, ...) \ + KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, T value) CMT_NOEXCEPT { return __VA_ARGS__; } + +KFR_INTRIN_BROADCAST(i8, 16, _mm_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 8, _mm_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 4, _mm_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 2, _mm_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(u8, 16, _mm_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 8, _mm_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 4, _mm_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 2, _mm_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(f32, 4, _mm_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 2, _mm_set1_pd(value)) + +#define KFR_INTRIN_SHUFFLE_SWAP(T, N, ...) \ + KFR_INTRINSIC simd<T, N> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N> ^ csize<1>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +#define KFR_INTRIN_SHUFFLE_LINEAR(T, Nout, Nin, ...) \ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizeseq_t<Nout>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } +#define KFR_INTRIN_SHUFFLE_LINEAR_START(T, Nout, Nin, Nstart, ...) \ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, \ + csizeseq_t<Nout, Nstart>, overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +#define KFR_INTRIN_SHUFFLE_CONCAT(T, Nin, ...) 
\ + KFR_INTRINSIC simd<T, Nin + Nin> simd_shuffle(simd2_t<T, Nin, Nin>, const simd<T, Nin>& x, \ + const simd<T, Nin>& y, csizeseq_t<Nin + Nin>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +// extend +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 1, _mm_cvtsi32_si128(u8(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 1, _mm_cvtsi32_si128(u16(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 1, _mm_set_ss(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 1, _mm_set_sd(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 2, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 2, _mm_cvtsi64_si128(x.whole)) + +// slice +KFR_INTRIN_SHUFFLE_LINEAR(i32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 
2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 4, 16, simd<i8, 4>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 8, 16, simd<i8, 8>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 2, 16, simd<u8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 4, 16, simd<u8, 4>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 8, 16, simd<u8, 8>::from(_mm_cvtsi128_si64(x))) + +KFR_INTRIN_SHUFFLE_LINEAR(i16, 2, 8, simd<i16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 4, 8, simd<i16, 4>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 2, 8, simd<u16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 4, 8, simd<u16, 4>::from(_mm_cvtsi128_si64(x))) + +KFR_INTRIN_SHUFFLE_LINEAR(i32, 2, 4, simd<i32, 2>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 2, 4, simd<u32, 2>::from(_mm_cvtsi128_si64(x))) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 8, 16, 8, simd<u8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 8, 16, 8, simd<i8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 4, 8, 4, simd<u16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1))) + +#define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) 
\ + KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x)))) +KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0))) +KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x))) +KFR_INTRIN_CONVERT(f64, i64, 2, + _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)), + _mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1)))) +#ifdef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(f64, f32, 4, _mm256_cvtps_pd(x)) +#else +KFR_INTRIN_CONVERT(f64, f32, 4, + simd<f64, 4>{ _mm_cvtps_pd(x), + _mm_cvtps_pd(_mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2))) }) +#endif +#ifdef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(f32, f64, 4, _mm256_cvtpd_ps(x)) +#else +KFR_INTRIN_CONVERT(f32, f64, 4, + simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)), + _mm_castps_pd(_mm_cvtpd_ps(x.high)))) }) +#endif +#endif // CMT_ARCH_SSE2 + +#ifdef CMT_ARCH_SSE41 + +KFR_INTRIN_CONVERT(i16, i8, 8, _mm_cvtepi8_epi16(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u16, u8, 8, _mm_cvtepu8_epi16(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, i16, 4, _mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u16, 4, _mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i32, i8, 4, _mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u8, 4, _mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(i64, i32, 2, _mm_cvtepi32_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u32, 2, _mm_cvtepu32_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i16, 2, _mm_cvtepi16_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u16, 2, 
_mm_cvtepu16_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i8, 2, _mm_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u8, 2, _mm_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(f32, i8, 4, _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, i16, 4, _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u8, 4, _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u16, 4, _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole)))) + +#ifndef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(i64, i32, 4, + simd<i64, 4>{ _mm_cvtepi32_epi64(x), + _mm_cvtepi32_epi64(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))) }) +#endif +#endif + +#ifdef CMT_ARCH_AVX +KFR_INTRIN_MAKE(4, f64, _mm256_setr_pd) +KFR_INTRIN_MAKE(8, f32, _mm256_setr_ps) + +KFR_INTRIN_BITCAST(f32, i32, 8, _mm256_castsi256_ps(x)) + +KFR_INTRIN_BITCAST(i32, f32, 8, _mm256_castps_si256(x)) +KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x)) +KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x)) + +KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y) +{ + return _mm256_insertf128_ps(_mm256_castps128_ps256(x), y, 1); +} + +KFR_INTRINSIC __m256d KFR_mm256_setr_m128d(__m128d x, __m128d y) +{ + return _mm256_insertf128_pd(_mm256_castpd128_pd256(x), y, 1); +} +KFR_INTRINSIC __m256i KFR_mm256_setr_m128i(__m128i x, __m128i y) +{ +#ifdef CMT_ARCH_AVX2 + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), y, 1); +#else + return _mm256_castps_si256( + _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(x)), _mm_castsi128_ps(y), 1)); +#endif +} + +KFR_INTRIN_SHUFFLE_CONCAT(f32, 4, KFR_mm256_setr_m128(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(f64, 2, KFR_mm256_setr_m128d(x, y)) + +// concat +KFR_INTRIN_SHUFFLE_CONCAT(i8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i32, 4, 
KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i64, 2, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u32, 4, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u64, 2, KFR_mm256_setr_m128i(x, y)) +// low +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 8, _mm256_castps256_ps128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 4, _mm256_castpd256_pd128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 4, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x)) + +// extend +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x)) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2, 4, 2, _mm256_extractf128_pd(x, 1)) + +KFR_INTRIN_BROADCAST(f32, 8, _mm256_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 4, _mm256_set1_pd(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 8, 1, _mm256_castps128_ps256(_mm_set_ss(x))) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 4, 1, _mm256_castpd128_pd256(_mm_set_sd(x))) +#endif // CMT_ARCH_AVX + +#ifdef CMT_ARCH_AVX2 +KFR_INTRIN_MAKE(4, i64, _mm256_setr_epi64x) +KFR_INTRIN_MAKE(4, u64, _mm256_setr_epi64x) +KFR_INTRIN_MAKE(8, i32, _mm256_setr_epi32) +KFR_INTRIN_MAKE(8, u32, _mm256_setr_epi32) +KFR_INTRIN_MAKE(16, i16, _mm256_setr_epi16) +KFR_INTRIN_MAKE(16, u16, _mm256_setr_epi16) +KFR_INTRIN_MAKE(32, i8, _mm256_setr_epi8) +KFR_INTRIN_MAKE(32, u8, _mm256_setr_epi8) + 
+KFR_INTRIN_CONVERT(i16, i8, 16, _mm256_cvtepi8_epi16(x)) +KFR_INTRIN_CONVERT(u16, u8, 16, _mm256_cvtepu8_epi16(x)) + +KFR_INTRIN_CONVERT(i32, i16, 8, _mm256_cvtepi16_epi32(x)) +KFR_INTRIN_CONVERT(u32, u16, 8, _mm256_cvtepu16_epi32(x)) +KFR_INTRIN_CONVERT(i32, i8, 8, _mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u8, 8, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i64, i32, 4, _mm256_cvtepi32_epi64(x)) +KFR_INTRIN_CONVERT(u64, u32, 4, _mm256_cvtepu32_epi64(x)) +KFR_INTRIN_CONVERT(i64, i16, 4, _mm256_cvtepi16_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u16, 4, _mm256_cvtepu16_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i8, 4, _mm256_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u8, 4, _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(f32, i8, 8, _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, i16, 8, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(x))) +KFR_INTRIN_CONVERT(f32, u8, 8, _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u16, 8, _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(x))) + +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 8, 16, 8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 8, 16, 8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) + +KFR_INTRIN_BROADCAST(i8, 32, _mm256_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 16, _mm256_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 8, 
_mm256_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 4, _mm256_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(u8, 32, _mm256_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 16, _mm256_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 8, _mm256_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 4, _mm256_set1_epi64x(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 16, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 8, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 4, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 2, _mm256_castsi128_si256(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 
2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, f32, 8, _mm256_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(f32, i32, 8, _mm256_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(f64, i32, 4, _mm256_cvtepi32_pd(x)) +KFR_INTRIN_CONVERT(i32, f64, 4, _mm256_cvttpd_epi32(x)) +#endif // CMT_ARCH_AVX2 + +#ifdef CMT_ARCH_AVX512 + +static inline __m512d KFR_mm512_setr_pd(f64 x0, f64 x1, f64 x2, f64 x3, f64 x4, f64 x5, f64 x6, f64 x7) +{ + return _mm512_set_pd(x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512 KFR_mm512_setr_ps(f32 x0, f32 x1, f32 x2, f32 x3, f32 x4, f32 x5, f32 x6, f32 x7, f32 x8, + f32 x9, f32 x10, f32 x11, f32 x12, f32 x13, f32 x14, f32 x15) +{ + return _mm512_set_ps(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi64(i64 x0, i64 x1, i64 x2, i64 x3, i64 x4, i64 x5, i64 x6, i64 x7) +{ + return _mm512_set_epi64(x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi32(i32 x0, i32 x1, i32 x2, i32 x3, i32 x4, i32 x5, i32 x6, i32 x7, + i32 x8, i32 x9, i32 x10, i32 x11, i32 x12, i32 x13, i32 x14, + i32 x15) +{ + return _mm512_set_epi32(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi16(i16 x0, i16 x1, i16 x2, i16 x3, i16 x4, i16 x5, i16 x6, i16 x7, + i16 x8, i16 x9, i16 x10, i16 x11, i16 x12, i16 x13, i16 x14, + i16 x15, i16 x16, i16 x17, i16 x18, i16 x19, i16 x20, i16 x21, + i16 x22, i16 x23, i16 x24, i16 x25, i16 x26, i16 x27, i16 x28, + i16 x29, i16 x30, i16 x31) +{ + return _mm512_set_epi16(x31, x30, x29, x28, x27, 
x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16, + x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi8(i8 x0, i8 x1, i8 x2, i8 x3, i8 x4, i8 x5, i8 x6, i8 x7, i8 x8, + i8 x9, i8 x10, i8 x11, i8 x12, i8 x13, i8 x14, i8 x15, i8 x16, + i8 x17, i8 x18, i8 x19, i8 x20, i8 x21, i8 x22, i8 x23, i8 x24, + i8 x25, i8 x26, i8 x27, i8 x28, i8 x29, i8 x30, i8 x31, i8 x32, + i8 x33, i8 x34, i8 x35, i8 x36, i8 x37, i8 x38, i8 x39, i8 x40, + i8 x41, i8 x42, i8 x43, i8 x44, i8 x45, i8 x46, i8 x47, i8 x48, + i8 x49, i8 x50, i8 x51, i8 x52, i8 x53, i8 x54, i8 x55, i8 x56, + i8 x57, i8 x58, i8 x59, i8 x60, i8 x61, i8 x62, i8 x63) +{ + return _mm512_set_epi8(x63, x62, x61, x60, x59, x58, x57, x56, x55, x54, x53, x52, x51, x50, x49, x48, + x47, x46, x45, x44, x43, x42, x41, x40, x39, x38, x37, x36, x35, x34, x33, x32, + x31, x30, x29, x28, x27, x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16, + x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} + +KFR_INTRINSIC __m512 KFR_mm512_setr_m256(__m256 x, __m256 y) +{ + return _mm512_insertf32x8(_mm512_castps256_ps512(x), y, 1); +} + +KFR_INTRINSIC __m512d KFR_mm512_setr_m256d(__m256d x, __m256d y) +{ + return _mm512_insertf64x4(_mm512_castpd256_pd512(x), y, 1); +} +KFR_INTRINSIC __m512i KFR_mm512_setr_m256i(__m256i x, __m256i y) +{ + return _mm512_inserti32x8(_mm512_castsi256_si512(x), y, 1); +} + +KFR_INTRIN_MAKE(8, f64, KFR_mm512_setr_pd) +KFR_INTRIN_MAKE(16, f32, KFR_mm512_setr_ps) + +KFR_INTRIN_MAKE(8, i64, KFR_mm512_setr_epi64) +KFR_INTRIN_MAKE(8, u64, KFR_mm512_setr_epi64) +KFR_INTRIN_MAKE(16, i32, KFR_mm512_setr_epi32) +KFR_INTRIN_MAKE(16, u32, KFR_mm512_setr_epi32) +KFR_INTRIN_MAKE(32, i16, KFR_mm512_setr_epi16) +KFR_INTRIN_MAKE(32, u16, KFR_mm512_setr_epi16) +KFR_INTRIN_MAKE(64, i8, KFR_mm512_setr_epi8) +KFR_INTRIN_MAKE(64, u8, KFR_mm512_setr_epi8) + +KFR_INTRIN_BROADCAST(f32, 16, _mm512_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 8, 
_mm512_set1_pd(value)) + +KFR_INTRIN_BROADCAST(i8, 64, _mm512_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 32, _mm512_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 16, _mm512_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 8, _mm512_set1_epi64(value)) +KFR_INTRIN_BROADCAST(u8, 64, _mm512_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 32, _mm512_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 16, _mm512_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 8, _mm512_set1_epi64(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 4, 
_mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, f32, 16, _mm512_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(f32, i32, 16, _mm512_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(f64, i32, 8, _mm512_cvtepi32_pd(x)) +KFR_INTRIN_CONVERT(i32, f64, 8, _mm512_cvttpd_epi32(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 4, _mm512_castps128_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2, _mm512_castpd128_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 16, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 8, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 4, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2, _mm512_castsi128_si512(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 2 * 4, _mm512_castps256_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2 * 2, _mm512_castpd256_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2 * 16, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2 * 8, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2 * 4, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2 * 2, _mm512_castsi256_si512(x)) + +// low +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 8 * 2, _mm512_castps512_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 4 * 2, _mm512_castpd512_pd256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 16 * 2, 
_mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4 * 2, 8 * 2, 4 * 2, _mm512_extractf32x8_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2 * 2, 4 * 2, 2 * 2, _mm512_extractf64x4_pd(x, 1)) + +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4 * 2, 8 * 2, 4 * 2, _mm512_extracti32x8_epi32(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2 * 2, 4 * 2, 2 * 2, _mm512_extracti64x4_epi64(x, 1)) + +// concat +KFR_INTRIN_SHUFFLE_CONCAT(f32, 4 * 2, KFR_mm512_setr_m256(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(f64, 2 * 2, KFR_mm512_setr_m256d(x, y)) + +KFR_INTRIN_SHUFFLE_CONCAT(i8, 16 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i16, 8 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i32, 4 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i64, 2 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u8, 16 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u16, 8 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u32, 4 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u64, 2 * 2, KFR_mm512_setr_m256i(x, y)) +#endif + +#endif + +// generic functions + +template <typename T, size_t N1> +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT; + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) CMT_NOEXCEPT; + +template <typename T, size_t N> +KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT +{ + return bitcast_anything<simd_array<T, N>>(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT +{ + return bitcast_anything<simd<T, N>>(x); +} + +#define KFR_COMPONENTWISE_RET(code) \ + vec<T, N> result; \ + for (size_t i = 0; i < N; i++) \ + code; \ + return result; + +#define KFR_COMPONENTWISE_RET_I(Tvec, code) \ + Tvec result; \ + for (size_t i = 0; i < result.size(); i++) \ + code; \ + return result; + +#define KFR_COMPONENTWISE(code) \ + for (size_t i = 0; i < N; i++) \ + code; + +template <typename Tout> +KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete; + +template <typename Tout, typename Arg> +KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT +{ + return simd<Tout, 1>{ static_cast<Tout>(arg) }; +} + +template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)> +KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT; + +template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow, N - Nlow>(simd_make_helper<Tout>(csizeseq<Nlow>, args...), + simd_make_helper<Tout>(csizeseq<N - Nlow, Nlow>, args...)); +} + +template <typename T, size_t... indices, typename... Args, size_t N> +KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + const T temp[] = { static_cast<T>(args)... 
}; + return simd_make(ctype<T>, temp[indices]...); +} + +/// @brief Returns vector with undefined value +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_undefined() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd<Tout, N> x; + return x; +} + +/// @brief Returns vector with all zeros +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_zeros() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return from_simd_array<Tout, N>({ Tout() }); +} + +/// @brief Returns vector with all ones +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_allones() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<Tout, N> x{}; + KFR_COMPONENTWISE(x.val[i] = special_constants<Tout>::allones()); + return from_simd_array(x); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)), + KFR_ENABLE_IF(Nout == 1 || N == 1)> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return bitcast_anything<simd<Tout, Nout>>(x); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)), + KFR_ENABLE_IF(Nout > 1 && N > 1)> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow * Nout / N, (N - Nlow) * Nout / N>( + simd_bitcast(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_bitcast(simd_cvt_t<Tout, Tin, N - Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); +} + +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) 
CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return to_simd_array<T, N>(value).val[index]; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<T, N> arr = to_simd_array<T, N>(value); + arr.val[index] = x; + return from_simd_array(arr); +} + +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N>, + overload_priority<10>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N1>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>&, + csizeseq_t<N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N2>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>&, const simd<T, N2>& y, + csizeseq_t<N2, N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return y; +} + +// concat() +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N + N>, simd_halves<T, N + N>>::value)> +KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizeseq_t<N + N>, overload_priority<8>) CMT_NOEXCEPT +{ + return simd<T, N + N>{ x, y }; +} + +template <typename T> +KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT +{ + return { value }; +} + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)> +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) CMT_NOEXCEPT +{ + return simd_concat<T, Nlow, N - Nlow>(simd_broadcast(simd_t<T, Nlow>{}, value), + simd_broadcast(simd_t<T, N - Nlow>{}, value)); +} + 
+template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2>, + overload_priority<7>) CMT_NOEXCEPT +{ + return x.low; +} + +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2, N / 2>, + overload_priority<7>) CMT_NOEXCEPT +{ + return x.high; +} + +template <typename T, size_t Nout, size_t N> +simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) + { + const size_t index = indices[i]; + result.val[i] = index >= N ? T() : x.val[index]; + } + return result; +} + +template <typename T, size_t Nout, size_t N1, size_t N2> +simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd_array<T, N2>& y, + const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) + { + const size_t index = indices[i]; + result.val[i] = index > N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index]; + } + return result; +} + +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array)); +#else + return from_simd_array<T, Nout>({ (indices > N ? T() : to_simd_array<T, N>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N, size_t N2 = N, size_t... 
indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + static_assert(N == N2, ""); + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + const simd_array<T, N> yy = to_simd_array<T, N>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array)); +#else + return from_simd_array<T, Nout>( + { (indices > N * 2 ? T() + : indices >= N ? to_simd_array<T, N>(y).val[indices - N] + : to_simd_array<T, N>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N1> xx = to_simd_array<T, N1>(x); + const simd_array<T, N2> yy = to_simd_array<T, N2>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N1, N2>(xx, yy, indices_array)); +#else + + return from_simd_array<T, Nout>( + { (indices > N1 + N2 ? T() + : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1] + : to_simd_array<T, N1>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N1> +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) CMT_NOEXCEPT +{ + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); +} + +template <typename Tout, typename Tin, size_t N, size_t... indices> +KFR_INTRINSIC simd<Tout, N> simd_convert__(const simd<Tin, N>& x, csizes_t<indices...>) CMT_NOEXCEPT +{ + const simd_array<Tin, N> xx = to_simd_array<Tin, N>(x); + return simd_make(ctype<Tout>, static_cast<Tout>(xx.val[indices])...); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin> +KFR_INTRINSIC simd<Tout, 1> simd_convert(simd_cvt_t<Tout, Tin, 1>, const simd<Tin, 1>& x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return simd_make(ctype<Tout>, static_cast<Tout>(x)); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow, N - Nlow>( + simd_convert(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_convert(simd_cvt_t<Tout, Tin, N - Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT +{ + return x; +} + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-attributes") + +template <typename T, size_t N, bool A> +using simd_storage = struct_with_alignment<simd<T, N>, A>; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT +{ + return reinterpret_cast<typename simd_storage<T, 
N, A>::const_pointer>(src)->value; +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = + cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); + constexpr auto concat_indices = cvalseq_t<size_t, N>(); + return simd_shuffle( + simd2_t<T, first, first>{}, simd_read<first, A>(src), + simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), + concat_indices, overload_auto); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT +{ + reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value; +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto)); + simd_write<false, rest>(dest + first, + simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto)); +} + +template <typename T, size_t N> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return to_simd_array<T, N>(value).val[index]; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<T, N> arr = to_simd_array<T, N>(value); + arr.val[index] = x; + return from_simd_array(arr); +} +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr + 
+CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/impl/basicoperators_clang.hpp b/include/kfr/simd/impl/basicoperators_clang.hpp @@ -0,0 +1,178 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../mask.hpp" +#include "function.hpp" +#include <algorithm> +#include <utility> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return -x.v; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x) +{ + return simd_bitcast(simd_cvt_t<T, utype<T>, N>{}, ~simd_bitcast(simd_cvt_t<utype<T>, T, N>{}, x.v)); +} + +#define KFR_OP_SCALAR2(fn, op, resultprefix, operprefix, soperprefix) \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y) \ + { \ + return resultprefix(operprefix(x.v) op soperprefix(y)); \ + } \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y) \ + { \ + return resultprefix(soperprefix(x) op operprefix(y.v)); \ + } + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v + y.v; +} +KFR_OP_SCALAR2(add, +, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v - y.v; +} +KFR_OP_SCALAR2(sub, -, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v * y.v; +} +KFR_OP_SCALAR2(mul, *, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v / y.v; +} +KFR_OP_SCALAR2(div, /, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, 
N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) & (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(band, &, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) | (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(bor, |, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) ^ (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(bxor, ^, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y.v); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y.v); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v == y.v); +} +KFR_OP_SCALAR2(eq, ==, (simd<T, N>), , ) 
+ +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v != y.v); +} +KFR_OP_SCALAR2(ne, !=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v <= y.v); +} +KFR_OP_SCALAR2(le, <=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v >= y.v); +} +KFR_OP_SCALAR2(ge, >=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v < y.v); +} +KFR_OP_SCALAR2(lt, <, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v > y.v); +} +KFR_OP_SCALAR2(gt, >, (simd<T, N>), , ) +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp @@ -0,0 +1,1674 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../mask.hpp" +#include "function.hpp" +#include <algorithm> +#include <utility> + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4700)) +CMT_PRAGMA_MSVC(warning(disable : 4309)) + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC __m128 _mm_allones_ps() +{ + return _mm_castsi128_ps(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128())); +} + +KFR_INTRINSIC __m128d _mm_allones_pd() +{ + return _mm_castsi128_pd(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128())); +} + +KFR_INTRINSIC __m128i _mm_allones_si128() { return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); } + +KFR_INTRINSIC __m128 _mm_not_ps(const __m128& x) { return _mm_xor_ps(x, _mm_allones_ps()); } + +KFR_INTRINSIC __m128d _mm_not_pd(const __m128d& x) { return _mm_xor_pd(x, _mm_allones_pd()); } + +KFR_INTRINSIC __m128i _mm_not_si128(const __m128i& x) { return _mm_xor_si128(x, _mm_allones_si128()); } + +KFR_INTRINSIC __m128i _mm_highbit_epi8() { return _mm_set1_epi8(static_cast<char>(0x80)); } +KFR_INTRINSIC __m128i _mm_highbit_epi16() { return _mm_set1_epi16(static_cast<short>(0x8000)); } +KFR_INTRINSIC __m128i _mm_highbit_epi32() { return _mm_set1_epi32(static_cast<int>(0x80000000)); } +KFR_INTRINSIC __m128i _mm_highbit_epi64() { return _mm_set1_epi64x(0x8000000000000000ll); } + +KFR_INTRINSIC f32sse add(const f32sse& x, const f32sse& y) { return f32sse(_mm_add_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse sub(const f32sse& x, const f32sse& y) { return f32sse(_mm_sub_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse mul(const f32sse& x, const f32sse& y) { 
return f32sse(_mm_mul_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse div(const f32sse& x, const f32sse& y) { return f32sse(_mm_div_ps(x.v, y.v)); } + +KFR_INTRINSIC f64sse add(const f64sse& x, const f64sse& y) { return f64sse(_mm_add_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse sub(const f64sse& x, const f64sse& y) { return f64sse(_mm_sub_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse mul(const f64sse& x, const f64sse& y) { return f64sse(_mm_mul_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse div(const f64sse& x, const f64sse& y) { return f64sse(_mm_div_pd(x.v, y.v)); } + +KFR_INTRINSIC u8sse add(const u8sse& x, const u8sse& y) { return _mm_add_epi8(x.v, y.v); } +KFR_INTRINSIC u8sse sub(const u8sse& x, const u8sse& y) { return _mm_sub_epi8(x.v, y.v); } +KFR_INTRINSIC u8sse div(const u8sse& x, const u8sse& y) +{ + KFR_COMPONENTWISE_RET_I(u8sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC i8sse add(const i8sse& x, const i8sse& y) { return _mm_add_epi8(x.v, y.v); } +KFR_INTRINSIC i8sse sub(const i8sse& x, const i8sse& y) { return _mm_sub_epi8(x.v, y.v); } +KFR_INTRINSIC i8sse div(const i8sse& x, const i8sse& y) +{ + KFR_COMPONENTWISE_RET_I(i8sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC __m128i mul_epi8(const __m128i& x, const __m128i& y) +{ + const __m128i even = _mm_mullo_epi16(x, y); + const __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(x, 8), _mm_srli_epi16(y, 8)); + return _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_srli_epi16(_mm_slli_epi16(even, 8), 8)); +} + +KFR_INTRINSIC u8sse mul(const u8sse& x, const u8sse& y) { return mul_epi8(x.v, y.v); } + +KFR_INTRINSIC i8sse mul(const i8sse& x, const i8sse& y) { return mul_epi8(x.v, y.v); } + +KFR_INTRINSIC u16sse add(const u16sse& x, const u16sse& y) { return _mm_add_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse sub(const u16sse& x, const u16sse& y) { return _mm_sub_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse mul(const u16sse& x, const u16sse& y) { return _mm_mullo_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse div(const u16sse& x, const u16sse& y) +{ + KFR_COMPONENTWISE_RET_I(u16sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC i16sse add(const i16sse& x, const i16sse& y) { return _mm_add_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse sub(const i16sse& x, const i16sse& y) { return _mm_sub_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse mul(const i16sse& x, const i16sse& y) { return _mm_mullo_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse div(const i16sse& x, const i16sse& y) +{ + KFR_COMPONENTWISE_RET_I(i16sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC u32sse add(const u32sse& x, const u32sse& y) { return _mm_add_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse sub(const u32sse& x, const u32sse& y) { return _mm_sub_epi32(x.v, y.v); } + +KFR_INTRINSIC i32sse add(const i32sse& x, const i32sse& y) { return _mm_add_epi32(x.v, y.v); } +KFR_INTRINSIC i32sse sub(const i32sse& x, const i32sse& y) { return _mm_sub_epi32(x.v, y.v); } + +#if defined CMT_ARCH_SSE41 +KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y) { return _mm_mullo_epi32(x.v, y.v); } +KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y) { return _mm_mullo_epi32(x.v, y.v); } +#else +KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y) +{ + __m128i tmp1 = _mm_mul_epu32(x.v, y.v); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), + _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); +} +KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y) +{ + __m128i tmp1 = _mm_mul_epu32(x.v, y.v); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), + _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); +} +#endif +KFR_INTRINSIC u32sse div(const u32sse& x, const u32sse& y) +{ + KFR_COMPONENTWISE_RET_I(u32sse, result[i] = y[i] ? x[i] / y[i] : 0); +} +KFR_INTRINSIC i32sse div(const i32sse& x, const i32sse& y) +{ + KFR_COMPONENTWISE_RET_I(i32sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC u64sse add(const u64sse& x, const u64sse& y) { return _mm_add_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse sub(const u64sse& x, const u64sse& y) { return _mm_sub_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse mul(const u64sse& x, const u64sse& y) +{ + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = x[i] * y[i]); +} +KFR_INTRINSIC u64sse div(const u64sse& x, const u64sse& y) +{ + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC i64sse add(const i64sse& x, const i64sse& y) { return _mm_add_epi64(x.v, y.v); } +KFR_INTRINSIC i64sse sub(const i64sse& x, const i64sse& y) { return _mm_sub_epi64(x.v, y.v); } +KFR_INTRINSIC i64sse mul(const i64sse& x, const i64sse& y) +{ + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] * y[i]); +} +KFR_INTRINSIC i64sse div(const i64sse& x, const i64sse& y) +{ + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC f32sse shl(const f32sse& x, unsigned y) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(x.v), y)); +} +KFR_INTRINSIC f64sse shl(const f64sse& x, unsigned y) +{ + return _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(x.v), y)); +} +KFR_INTRINSIC f32sse shr(const f32sse& x, unsigned y) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(x.v), y)); +} +KFR_INTRINSIC f64sse shr(const f64sse& x, unsigned y) +{ + return _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(x.v), y)); +} + +KFR_INTRINSIC u16sse shl(const u16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); } +KFR_INTRINSIC u32sse shl(const u32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); } +KFR_INTRINSIC u64sse shl(const u64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); } +KFR_INTRINSIC i16sse shl(const i16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); } +KFR_INTRINSIC i32sse shl(const i32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); } +KFR_INTRINSIC i64sse shl(const i64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); } + +KFR_INTRINSIC u16sse shr(const u16sse& x, unsigned y) { return _mm_srli_epi16(x.v, y); } +KFR_INTRINSIC u32sse shr(const u32sse& x, unsigned y) { return _mm_srli_epi32(x.v, y); } +KFR_INTRINSIC u64sse shr(const u64sse& x, unsigned y) { return _mm_srli_epi64(x.v, y); } +KFR_INTRINSIC i16sse shr(const i16sse& x, unsigned y) { return _mm_srai_epi16(x.v, y); } +KFR_INTRINSIC i32sse shr(const i32sse& x, unsigned y) { return _mm_srai_epi32(x.v, y); } + 
// ---- SSE2: 8-bit shifts ----
// There is no epi8 shift instruction. Each byte is widened into the HIGH half
// of a 16-bit lane (_mm_unpack*_epi8 with zero as the first operand), shifted
// as 16-bit, then narrowed back with a saturating pack.
// NOTE(review): after the 16-bit shift the payload still sits in the high byte
// of each lane, while _mm_packs_epi16 keeps the (saturated) low byte — confirm
// this matches the scalar per-byte shift for all y; verify against the scalar
// fallback path.
KFR_INTRINSIC u8sse shl(const u8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_slli_epi16(l, y);
    __m128i hh = _mm_slli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8sse shl(const i8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_slli_epi16(l, y);
    __m128i hh = _mm_slli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
KFR_INTRINSIC u8sse shr(const u8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_srli_epi16(l, y);
    __m128i hh = _mm_srli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
// Signed 8-bit right shift: arithmetic (srai) instead of logical (srli).
KFR_INTRINSIC i8sse shr(const i8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_srai_epi16(l, y);
    __m128i hh = _mm_srai_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}

// SSE2 has no arithmetic 64-bit right shift; done element-wise in scalar code.
// x[i] is the signed element, so >> here is an arithmetic shift; the bits are
// passed back through a u64sse temporary.
KFR_INTRINSIC i64sse shr(const i64sse& x, unsigned y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = x[i] >> y);
}

// Generic fallback: per-element shift by a vector of counts. Shifts the
// unsigned bit pattern (uibitcast) and casts back; enabled for any SIMD-sized
// type that has an unsigned-integer bit representation.
template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
{
    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
}
template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
{
    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
}

// ---- SSE2: bitwise AND / OR / XOR (float variants operate on the bit pattern) ----
KFR_INTRINSIC f32sse band(const f32sse& x, const f32sse& y) { return _mm_and_ps(x.v, y.v); }
KFR_INTRINSIC f64sse band(const f64sse& x, const f64sse& y) { return _mm_and_pd(x.v, y.v); }

KFR_INTRINSIC u8sse band(const u8sse& x, const u8sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u16sse band(const u16sse& x, const u16sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u32sse band(const u32sse& x, const u32sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u64sse band(const u64sse& x, const u64sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i8sse band(const i8sse& x, const i8sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i16sse band(const i16sse& x, const i16sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i32sse band(const i32sse& x, const i32sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i64sse band(const i64sse& x, const i64sse& y) { return _mm_and_si128(x.v, y.v); }

KFR_INTRINSIC f32sse bor(const f32sse& x, const f32sse& y) { return _mm_or_ps(x.v, y.v); }
KFR_INTRINSIC f64sse bor(const f64sse& x, const f64sse& y) { return _mm_or_pd(x.v, y.v); }

KFR_INTRINSIC u8sse bor(const u8sse& x, const u8sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u16sse bor(const u16sse& x, const u16sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u32sse bor(const u32sse& x, const u32sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u64sse bor(const u64sse& x, const u64sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i8sse bor(const i8sse& x, const i8sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i16sse bor(const i16sse& x, const i16sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i32sse bor(const i32sse& x, const i32sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i64sse bor(const i64sse& x, const i64sse& y) { return _mm_or_si128(x.v, y.v); }

KFR_INTRINSIC f32sse bxor(const f32sse& x, const f32sse& y) { return _mm_xor_ps(x.v, y.v); }
KFR_INTRINSIC f64sse bxor(const f64sse& x, const f64sse& y) { return _mm_xor_pd(x.v, y.v); }

KFR_INTRINSIC u8sse bxor(const u8sse& x, const u8sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u16sse bxor(const u16sse& x, const u16sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u32sse bxor(const u32sse& x, const u32sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u64sse bxor(const u64sse& x, const u64sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i8sse bxor(const i8sse& x, const i8sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i16sse bxor(const i16sse& x, const i16sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i32sse bxor(const i32sse& x, const i32sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i64sse bxor(const i64sse& x, const i64sse& y) { return _mm_xor_si128(x.v, y.v); }

// ---- SSE2: comparisons (each returns an all-ones / all-zero per-lane mask) ----
// 64-bit integer eq/ne are provided further below (SSE4.1 path or scalar fallback).
KFR_INTRINSIC f32sse eq(const f32sse& x, const f32sse& y) { return _mm_cmpeq_ps(x.v, y.v); }
KFR_INTRINSIC f64sse eq(const f64sse& x, const f64sse& y) { return _mm_cmpeq_pd(x.v, y.v); }
KFR_INTRINSIC u8sse eq(const u8sse& x, const u8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse eq(const u16sse& x, const u16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC u32sse eq(const u32sse& x, const u32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC i8sse eq(const i8sse& x, const i8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse eq(const i16sse& x, const i16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse eq(const i32sse& x, const i32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }

// ne = bitwise complement of eq. For floats this means NaN operands compare
// as "not equal" (cmpeq is false for NaN, complement gives true).
KFR_INTRINSIC f32sse ne(const f32sse& x, const f32sse& y) { return _mm_not_ps(_mm_cmpeq_ps(x.v, y.v)); }
KFR_INTRINSIC f64sse ne(const f64sse& x, const f64sse& y) { return _mm_not_pd(_mm_cmpeq_pd(x.v, y.v)); }
KFR_INTRINSIC u8sse ne(const u8sse& x, const u8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
KFR_INTRINSIC u16sse ne(const u16sse& x, const u16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
KFR_INTRINSIC u32sse ne(const u32sse& x, const u32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }
KFR_INTRINSIC i8sse ne(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse ne(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse ne(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }

KFR_INTRINSIC f32sse lt(const f32sse& x, const f32sse& y) { return _mm_cmplt_ps(x.v, y.v); }
KFR_INTRINSIC f64sse lt(const f64sse& x, const f64sse& y) { return _mm_cmplt_pd(x.v, y.v); }
KFR_INTRINSIC i8sse lt(const i8sse& x, const i8sse& y) { return _mm_cmplt_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse lt(const i16sse& x, const i16sse& y) { return _mm_cmplt_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse lt(const i32sse& x, const i32sse& y) { return _mm_cmplt_epi32(x.v, y.v); }

// Unsigned compares: SSE only has signed integer compares, so both operands
// are biased by the sign bit (adding 0x80.. flips it), after which a signed
// compare orders the values as unsigned.
KFR_INTRINSIC u8sse lt(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
}

KFR_INTRINSIC u16sse lt(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32sse lt(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
}

KFR_INTRINSIC f32sse gt(const f32sse& x, const f32sse& y) { return _mm_cmpgt_ps(x.v, y.v); }
KFR_INTRINSIC f64sse gt(const f64sse& x, const f64sse& y) { return _mm_cmpgt_pd(x.v, y.v); }
KFR_INTRINSIC i8sse gt(const i8sse& x, const i8sse& y) { return _mm_cmpgt_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse gt(const i16sse& x, const i16sse& y) { return _mm_cmpgt_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse gt(const i32sse& x, const i32sse& y) { return _mm_cmpgt_epi32(x.v, y.v); }

KFR_INTRINSIC u8sse gt(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
}

KFR_INTRINSIC u16sse gt(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32sse gt(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
}

// le / ge for integers: complement of the strict compare (le == not gt,
// ge == not lt).
KFR_INTRINSIC f32sse le(const f32sse& x, const f32sse& y) { return _mm_cmple_ps(x.v, y.v); }
KFR_INTRINSIC f64sse le(const f64sse& x, const f64sse& y) { return _mm_cmple_pd(x.v, y.v); }
KFR_INTRINSIC i8sse le(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpgt_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse le(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpgt_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse le(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpgt_epi32(x.v, y.v)); }

KFR_INTRINSIC u8sse le(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_not_si128(_mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
}

KFR_INTRINSIC u16sse le(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_not_si128(_mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32sse le(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_not_si128(_mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
}

KFR_INTRINSIC f32sse ge(const f32sse& x, const f32sse& y) { return _mm_cmpge_ps(x.v, y.v); }
KFR_INTRINSIC f64sse ge(const f64sse& x, const f64sse& y) { return _mm_cmpge_pd(x.v, y.v); }
KFR_INTRINSIC i8sse ge(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmplt_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse ge(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmplt_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse ge(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmplt_epi32(x.v, y.v)); }

KFR_INTRINSIC u8sse ge(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_not_si128(_mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
}

KFR_INTRINSIC u16sse ge(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_not_si128(_mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32sse ge(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_not_si128(_mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
}

// 64-bit integer eq/ne: hardware compare when SSE4.1 intrinsics are usable,
// otherwise element-wise scalar fallback producing the same all-ones masks.
#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
#else
KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i]));
}
KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i]));
}
KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i]));
}
KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i]));
}
#endif

// 64-bit integer ordering: _mm_cmpgt_epi64 requires SSE4.2; all four
// relations are derived from it (lt swaps operands, ge/le complement).
#if defined CMT_ARCH_SSE42
KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(y.v, x.v); }
KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(y.v, x.v)); }
KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(x.v, y.v)); }

KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb));
}
KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb));
}
KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb)));
}
KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb)));
}

#else
KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i]));
}
KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i]));
}
KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i]));
}
KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i]));
}
KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i]));
}
KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i]));
}
KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i]));
}
KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i]));
}
#endif

#if defined CMT_ARCH_AVX

// ---- AVX: 256-bit float arithmetic ----
KFR_INTRINSIC f32avx add(const f32avx& x, const f32avx& y) { return f32avx(_mm256_add_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx add(const f64avx& x, const f64avx& y) { return f64avx(_mm256_add_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx sub(const f32avx& x, const f32avx& y) { return f32avx(_mm256_sub_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx sub(const f64avx& x, const f64avx& y) { return f64avx(_mm256_sub_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx mul(const f32avx& x, const f32avx& y) { return f32avx(_mm256_mul_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx mul(const f64avx& x, const f64avx& y) { return f64avx(_mm256_mul_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx div(const f32avx& x, const f32avx& y) { return f32avx(_mm256_div_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx div(const f64avx& x, const f64avx& y) { return f64avx(_mm256_div_pd(x.v, y.v)); }

// All-ones constant: 0 == 0 with the unordered EQ predicate yields a
// full mask without loading from memory.
KFR_INTRINSIC __m256 _mm256_allones_ps()
{
    return _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ);
}

KFR_INTRINSIC __m256d _mm256_allones_pd()
{
    return _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_UQ);
}

// 256-bit integer all-ones: cmpeq_epi8 needs AVX2, otherwise go via the
// float compare and bitcast.
#if defined CMT_ARCH_AVX2
KFR_INTRINSIC __m256i _mm256_allones_si256()
{
    return _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
}
#else
KFR_INTRINSIC __m256i _mm256_allones_si256()
{
    return _mm256_castps_si256(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ));
}
#endif

// Bitwise complement helpers (xor with all-ones).
KFR_INTRINSIC __m256 _mm256_not_ps(const __m256& x) { return _mm256_xor_ps(x, _mm256_allones_ps()); }
KFR_INTRINSIC __m256d _mm256_not_pd(const __m256d& x) { return _mm256_xor_pd(x, _mm256_allones_pd()); }
KFR_INTRINSIC __m256i _mm256_not_si256(const __m256i& x)
{
    return _mm256_xor_si256(x, _mm256_allones_si256());
}

// Sign-bit constants for the unsigned-compare bias trick (see SSE versions).
KFR_INTRINSIC __m256i _mm256_highbit_epi8() { return _mm256_set1_epi8(static_cast<char>(0x80)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi16() { return _mm256_set1_epi16(static_cast<short>(0x8000)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi32() { return _mm256_set1_epi32(static_cast<int>(0x80000000)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi64() { return _mm256_set1_epi64x(0x8000000000000000ll); }

// AVX float comparisons use the ordered (_OQ) predicates: any NaN operand
// makes the lane compare false.
// NOTE(review): _CMP_NEQ_OQ is also ordered, so ne(NaN, NaN) is false here,
// whereas the SSE ne above (not(cmpeq)) yields true for NaN operands —
// confirm which NaN behavior is intended (_CMP_NEQ_UQ would match SSE).
KFR_INTRINSIC f32avx eq(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_EQ_OQ); }
KFR_INTRINSIC f64avx eq(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_EQ_OQ); }
KFR_INTRINSIC f32avx ne(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_NEQ_OQ); }
KFR_INTRINSIC f64avx ne(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_NEQ_OQ); }
KFR_INTRINSIC f32avx lt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LT_OQ); }
KFR_INTRINSIC f64avx lt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LT_OQ); }
KFR_INTRINSIC f32avx gt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GT_OQ); }
KFR_INTRINSIC f64avx gt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GT_OQ); }
KFR_INTRINSIC f32avx le(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LE_OQ); }
KFR_INTRINSIC f64avx le(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LE_OQ); }
KFR_INTRINSIC f32avx ge(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GE_OQ); }
KFR_INTRINSIC f64avx ge(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GE_OQ); }

KFR_INTRINSIC f32avx band(const f32avx& x, const f32avx& y) { return _mm256_and_ps(x.v, y.v); }
KFR_INTRINSIC f64avx band(const f64avx& x, const f64avx& y) { return _mm256_and_pd(x.v, y.v); }
KFR_INTRINSIC f32avx bor(const f32avx& x, const f32avx& y) { return _mm256_or_ps(x.v, y.v); }
KFR_INTRINSIC f64avx bor(const f64avx& x, const f64avx& y) { return _mm256_or_pd(x.v, y.v); }
KFR_INTRINSIC f32avx bxor(const f32avx& x, const f32avx& y) { return _mm256_xor_ps(x.v, y.v); }
KFR_INTRINSIC f64avx bxor(const f64avx& x, const f64avx& y) { return _mm256_xor_pd(x.v, y.v); }

// Shifts of float bit patterns. AVX1 has no 256-bit integer shifts, so the
// fallback splits into two 128-bit halves and recombines with setr_m128.
KFR_INTRINSIC f32avx shl(const f32avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(x.v), y));
#else
    return _mm256_setr_m128(
        _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
        _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f64avx shl(const f64avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(x.v), y));
#else
    return _mm256_setr_m128d(
        _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
        _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f32avx shr(const f32avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(x.v), y));
#else
    return _mm256_setr_m128(
        _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
        _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f64avx shr(const f64avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(x.v), y));
#else
    return _mm256_setr_m128d(
        _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
        _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
#endif
}

#if defined CMT_ARCH_AVX2

// ---- AVX2: 256-bit integer arithmetic ----
// Integer division has no SIMD instruction anywhere; all div overloads run
// an element-wise scalar loop.
// NOTE(review): unlike the u64/i64 div overloads below, the 8/16/32-bit
// versions do not guard against y[i] == 0 (UB in the scalar loop) — confirm
// whether this asymmetry is intentional.
KFR_INTRINSIC u8avx add(const u8avx& x, const u8avx& y) { return _mm256_add_epi8(x.v, y.v); }
KFR_INTRINSIC u8avx sub(const u8avx& x, const u8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
KFR_INTRINSIC u8avx div(const u8avx& x, const u8avx& y)
{
    KFR_COMPONENTWISE_RET_I(u8avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC i8avx add(const i8avx& x, const i8avx& y) { return _mm256_add_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx sub(const i8avx& x, const i8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx div(const i8avx& x, const i8avx& y)
{
    KFR_COMPONENTWISE_RET_I(i8avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC u16avx add(const u16avx& x, const u16avx& y) { return _mm256_add_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx sub(const u16avx& x, const u16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx mul(const u16avx& x, const u16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx div(const u16avx& x, const u16avx& y)
{
    KFR_COMPONENTWISE_RET_I(u16avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC i16avx add(const i16avx& x, const i16avx& y) { return _mm256_add_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx sub(const i16avx& x, const i16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx mul(const i16avx& x, const i16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx div(const i16avx& x, const i16avx& y)
{
    KFR_COMPONENTWISE_RET_I(i16avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC u32avx add(const u32avx& x, const u32avx& y) { return _mm256_add_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx sub(const u32avx& x, const u32avx& y) { return _mm256_sub_epi32(x.v, y.v); }

KFR_INTRINSIC i32avx add(const i32avx& x, const i32avx& y) { return _mm256_add_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx sub(const i32avx& x, const i32avx& y) { return _mm256_sub_epi32(x.v, y.v); }

KFR_INTRINSIC u32avx mul(const u32avx& x, const u32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx mul(const i32avx& x, const i32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx div(const u32avx& x, const u32avx& y)
{
    KFR_COMPONENTWISE_RET_I(u32avx, result[i] = x[i] / y[i]);
}
KFR_INTRINSIC i32avx div(const i32avx& x, const i32avx& y)
{
    KFR_COMPONENTWISE_RET_I(i32avx, result[i] = x[i] / y[i]);
}

// 64-bit mul/div have no AVX2 instruction; scalar loops. div maps a zero
// divisor to 0 instead of invoking undefined behavior.
KFR_INTRINSIC u64avx add(const u64avx& x, const u64avx& y) { return _mm256_add_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx sub(const u64avx& x, const u64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx mul(const u64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = x[i] * y[i]);
}
KFR_INTRINSIC u64avx div(const u64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = y[i] ? x[i] / y[i] : 0);
}

KFR_INTRINSIC i64avx add(const i64avx& x, const i64avx& y) { return _mm256_add_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx sub(const i64avx& x, const i64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx mul(const i64avx& x, const i64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] * y[i]);
}
KFR_INTRINSIC i64avx div(const i64avx& x, const i64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = y[i] ? x[i] / y[i] : 0);
}

// 8-bit multiply: multiply even and odd bytes as 16-bit lanes, then merge the
// low byte of each product back into a single vector.
KFR_INTRINSIC __m256i mul_epi8(const __m256i& x, const __m256i& y)
{
    const __m256i even = _mm256_mullo_epi16(x, y);
    const __m256i odd = _mm256_mullo_epi16(_mm256_srli_epi16(x, 8), _mm256_srli_epi16(y, 8));
    return _mm256_or_si256(_mm256_slli_epi16(odd, 8), _mm256_srli_epi16(_mm256_slli_epi16(even, 8), 8));
}

KFR_INTRINSIC u8avx mul(const u8avx& x, const u8avx& y) { return mul_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx mul(const i8avx& x, const i8avx& y) { return mul_epi8(x.v, y.v); }

// ---- AVX2: 256-bit integer bitwise ops ----
KFR_INTRINSIC u8avx band(const u8avx& x, const u8avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u16avx band(const u16avx& x, const u16avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u32avx band(const u32avx& x, const u32avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u64avx band(const u64avx& x, const u64avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i8avx band(const i8avx& x, const i8avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i16avx band(const i16avx& x, const i16avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i32avx band(const i32avx& x, const i32avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i64avx band(const i64avx& x, const i64avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u8avx bor(const u8avx& x, const u8avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u16avx bor(const u16avx& x, const u16avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u32avx bor(const u32avx& x, const u32avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u64avx bor(const u64avx& x, const u64avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i8avx bor(const i8avx& x, const i8avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i16avx bor(const i16avx& x, const i16avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i32avx bor(const i32avx& x, const i32avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i64avx bor(const i64avx& x, const i64avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u8avx bxor(const u8avx& x, const u8avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u16avx bxor(const u16avx& x, const u16avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u32avx bxor(const u32avx& x, const u32avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u64avx bxor(const u64avx& x, const u64avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i8avx bxor(const i8avx& x, const i8avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i16avx bxor(const i16avx& x, const i16avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i32avx bxor(const i32avx& x, const i32avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i64avx bxor(const i64avx& x, const i64avx& y) { return _mm256_xor_si256(x.v, y.v); }

// ---- AVX2: shifts by a uniform (immediate-style) count ----
// Signed right shifts use srai (arithmetic); unsigned use srli (logical).
KFR_INTRINSIC u16avx shl(const u16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
KFR_INTRINSIC u32avx shl(const u32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
KFR_INTRINSIC i16avx shl(const i16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
KFR_INTRINSIC i32avx shl(const i32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
KFR_INTRINSIC u16avx shr(const u16avx& x, unsigned y) { return _mm256_srli_epi16(x.v, y); }
KFR_INTRINSIC u32avx shr(const u32avx& x, unsigned y) { return _mm256_srli_epi32(x.v, y); }
KFR_INTRINSIC i16avx shr(const i16avx& x, unsigned y) { return _mm256_srai_epi16(x.v, y); }
KFR_INTRINSIC i32avx shr(const i32avx& x, unsigned y) { return _mm256_srai_epi32(x.v, y); }

KFR_INTRINSIC u64avx shl(const u64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
KFR_INTRINSIC u64avx shr(const u64avx& x, unsigned y) { return _mm256_srli_epi64(x.v, y); }
KFR_INTRINSIC i64avx shl(const i64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
// No srai_epi64 in AVX2; arithmetic i64 right shift is done per element
// (x[i] is signed, so >> is arithmetic; bits returned via a u64avx temporary).
KFR_INTRINSIC i64avx shr(const i64avx& x, unsigned y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = x[i] >> y);
}

// ---- AVX2: 8-bit shifts (same widen/shift/pack trick as the SSE versions) ----
// NOTE(review): the byte payload is placed in the HIGH half of each 16-bit
// lane, but _mm256_packs_epi16 keeps the saturated low byte after the shift —
// confirm this matches the scalar per-byte shift for all y.
KFR_INTRINSIC u8avx shl(const u8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_slli_epi16(l, y);
    __m256i hh = _mm256_slli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8avx shl(const i8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_slli_epi16(l, y);
    __m256i hh = _mm256_slli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC u8avx shr(const u8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_srli_epi16(l, y);
    __m256i hh = _mm256_srli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8avx shr(const i8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_srai_epi16(l, y);
    __m256i hh = _mm256_srai_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}

// ---- AVX2: per-element variable shifts ----
// The 128-bit sllv/srlv/srav forms are also AVX2 instructions, which is why
// these SSE-width overloads live inside this CMT_ARCH_AVX2 block.
KFR_INTRINSIC u32sse shl(const u32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC i32sse shl(const i32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC u64sse shl(const u64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse shl(const i64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }

KFR_INTRINSIC u32avx shl(const u32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx shl(const i32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx shl(const u64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx shl(const i64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }

// srav_epi64 does not exist in AVX2, so the signed 64-bit variable right
// shift falls back to a scalar loop.
KFR_INTRINSIC u32sse shr(const u32sse& x, const u32sse& y) { return _mm_srlv_epi32(x.v, y.v); }
KFR_INTRINSIC i32sse shr(const i32sse& x, const u32sse& y) { return _mm_srav_epi32(x.v, y.v); }
KFR_INTRINSIC u64sse shr(const u64sse& x, const u64sse& y) { return _mm_srlv_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse shr(const i64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] >> y[i]);
}

KFR_INTRINSIC u32avx shr(const u32avx& x, const u32avx& y) { return _mm256_srlv_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx shr(const i32avx& x, const u32avx& y) { return _mm256_srav_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx shr(const u64avx& x, const u64avx& y) { return _mm256_srlv_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx shr(const i64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] >> y[i]);
}

// Variable shifts of float bit patterns (cast to integer, shift, cast back).
KFR_INTRINSIC f32sse shl(const f32sse& x, const u32sse& y)
{
    return _mm_castsi128_ps(_mm_sllv_epi32(_mm_castps_si128(x.v), y.v));
}
KFR_INTRINSIC f64sse shl(const f64sse& x, const u64sse& y)
{
    return _mm_castsi128_pd(_mm_sllv_epi64(_mm_castpd_si128(x.v), y.v));
}
KFR_INTRINSIC f32sse shr(const f32sse& x, const u32sse& y)
{
    return _mm_castsi128_ps(_mm_srlv_epi32(_mm_castps_si128(x.v), y.v));
}
KFR_INTRINSIC f64sse shr(const f64sse& x, const u64sse& y)
{
    return _mm_castsi128_pd(_mm_srlv_epi64(_mm_castpd_si128(x.v), y.v));
}

KFR_INTRINSIC f32avx shl(const f32avx& x, const u32avx& y)
{
    return _mm256_castsi256_ps(_mm256_sllv_epi32(_mm256_castps_si256(x.v), y.v));
}
KFR_INTRINSIC f64avx shl(const f64avx& x, const u64avx& y)
{
    return _mm256_castsi256_pd(_mm256_sllv_epi64(_mm256_castpd_si256(x.v), y.v));
}
KFR_INTRINSIC f32avx shr(const f32avx& x, const u32avx& y)
{
    return _mm256_castsi256_ps(_mm256_srlv_epi32(_mm256_castps_si256(x.v), y.v));
}
KFR_INTRINSIC f64avx shr(const f64avx& x, const u64avx& y)
{
    return _mm256_castsi256_pd(_mm256_srlv_epi64(_mm256_castpd_si256(x.v), y.v));
}

// ---- AVX2: integer comparisons (per-lane all-ones / all-zero masks) ----
KFR_INTRINSIC i8avx eq(const i8avx& x, const i8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC i16avx eq(const i16avx& x, const i16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC i32avx eq(const i32avx& x, const i32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC i64avx eq(const i64avx& x, const i64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC u8avx eq(const u8avx& x, const u8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx eq(const u16avx& x, const u16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC u32avx eq(const u32avx& x, const u32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx eq(const u64avx& x, const u64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }

// ne = complement of eq.
KFR_INTRINSIC i8avx ne(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
}
KFR_INTRINSIC i16avx ne(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
}
KFR_INTRINSIC i32avx ne(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
}
KFR_INTRINSIC i64avx ne(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
}
KFR_INTRINSIC u8avx ne(const u8avx& x, const u8avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
}
KFR_INTRINSIC u16avx ne(const u16avx& x, const u16avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
}
KFR_INTRINSIC u32avx ne(const u32avx& x, const u32avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
}
KFR_INTRINSIC u64avx ne(const u64avx& x, const u64avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
}

// AVX2 only provides signed cmpgt; lt swaps operands, le/ge complement.
KFR_INTRINSIC i8avx lt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(y.v, x.v); }
KFR_INTRINSIC i16avx lt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(y.v, x.v); }
KFR_INTRINSIC i32avx lt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(y.v, x.v); }
KFR_INTRINSIC i64avx lt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(y.v, x.v); }

KFR_INTRINSIC i8avx gt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(x.v, y.v); }
KFR_INTRINSIC i16avx gt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(x.v, y.v); }
KFR_INTRINSIC i32avx gt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(x.v, y.v); }
KFR_INTRINSIC i64avx gt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(x.v, y.v); }

KFR_INTRINSIC i8avx le(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi8(x.v, y.v));
}
KFR_INTRINSIC i16avx le(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi16(x.v, y.v));
}
KFR_INTRINSIC i32avx le(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi32(x.v, y.v));
}
KFR_INTRINSIC i64avx le(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi64(x.v, y.v));
}

KFR_INTRINSIC i8avx ge(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi8(y.v, x.v));
}
KFR_INTRINSIC i16avx ge(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi16(y.v, x.v));
}
KFR_INTRINSIC i32avx ge(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi32(y.v, x.v));
}
KFR_INTRINSIC i64avx ge(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi64(y.v, x.v));
}

// Unsigned compares: bias both operands by the sign bit, then use the signed
// compare (same trick as the SSE versions).
KFR_INTRINSIC u8avx lt(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb));
}
KFR_INTRINSIC u16avx lt(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb));
}
// AVX2 unsigned comparisons, continued: only signed cmpgt exists, so both
// operands are biased by the sign-bit constant, after which the signed
// compare orders the values as unsigned. lt swaps operands; le complements gt.
KFR_INTRINSIC u32avx lt(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb));
}
KFR_INTRINSIC u64avx lt(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb));
}
KFR_INTRINSIC u8avx gt(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb));
}
KFR_INTRINSIC u16avx gt(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32avx gt(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb));
}
KFR_INTRINSIC u64avx gt(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb));
}
KFR_INTRINSIC u8avx le(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb)));
}
KFR_INTRINSIC u16avx le(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32avx le(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb)));
}
KFR_INTRINSIC u64avx le(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb)));
}
+KFR_INTRINSIC u8avx ge(const u8avx& x, const u8avx& y) +{ + const __m256i hb = _mm256_highbit_epi8(); + return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb))); +} +KFR_INTRINSIC u16avx ge(const u16avx& x, const u16avx& y) +{ + const __m256i hb = _mm256_highbit_epi16(); + return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb))); +} +KFR_INTRINSIC u32avx ge(const u32avx& x, const u32avx& y) +{ + const __m256i hb = _mm256_highbit_epi32(); + return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb))); +} +KFR_INTRINSIC u64avx ge(const u64avx& x, const u64avx& y) +{ + const __m256i hb = _mm256_highbit_epi64(); + return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb))); +} + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 add(const f32avx512& x, const f32avx512& y) { return _mm512_add_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 add(const f64avx512& x, const f64avx512& y) { return _mm512_add_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 sub(const f32avx512& x, const f32avx512& y) { return _mm512_sub_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 sub(const f64avx512& x, const f64avx512& y) { return _mm512_sub_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 mul(const f32avx512& x, const f32avx512& y) { return _mm512_mul_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 mul(const f64avx512& x, const f64avx512& y) { return _mm512_mul_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 div(const f32avx512& x, const f32avx512& y) { return _mm512_div_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 div(const f64avx512& x, const f64avx512& y) { return _mm512_div_pd(x.v, y.v); } + +KFR_INTRINSIC __m512 _mm512_allones_ps() +{ + return _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), + _mm512_setzero_si512(), 0xFF)); +} + +KFR_INTRINSIC __m512d _mm512_allones_pd() +{ + return 
_mm512_castsi512_pd(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(),
+                                              _mm512_setzero_si512(), 0xFF));
+}
+
+KFR_INTRINSIC __m512i _mm512_allones_si512()
+{
+    return _mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(),
+                                     0xFF);
+}
+
+// Bitwise NOT == xor with all-ones.
+KFR_INTRINSIC __m512 _mm512_not_ps(const __m512& x) { return _mm512_xor_ps(x, _mm512_allones_ps()); }
+KFR_INTRINSIC __m512d _mm512_not_pd(const __m512d& x) { return _mm512_xor_pd(x, _mm512_allones_pd()); }
+KFR_INTRINSIC __m512i _mm512_not_si512(const __m512i& x)
+{
+    return _mm512_xor_si512(x, _mm512_allones_si512());
+}
+
+// Per-lane sign-bit constants (used to bias unsigned compares elsewhere).
+KFR_INTRINSIC __m512i _mm512_highbit_epi8() { return _mm512_set1_epi8(static_cast<char>(0x80)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi16() { return _mm512_set1_epi16(static_cast<short>(0x8000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi32() { return _mm512_set1_epi32(static_cast<int>(0x80000000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi64() { return _mm512_set1_epi64(0x8000000000000000ll); }
+
+// AVX-512 float compares yield a lane bitmask; movm expands it back into a
+// full per-lane mask vector so results match the SSE/AVX mask layout.
+KFR_INTRINSIC f32avx512 eq(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 eq(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+// NOTE(review): _CMP_NEQ_OQ yields false when either operand is NaN, whereas
+// IEEE-754 `!=` (and SSE _mm_cmpneq_ps, which is NEQ_UQ) yields true --
+// confirm the ordered predicate is intentional here.
+KFR_INTRINSIC f32avx512 ne(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 ne(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f32avx512 lt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f64avx512 lt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f32avx512 gt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f64avx512 gt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f32avx512 le(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f64avx512 le(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f32avx512 ge(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+KFR_INTRINSIC f64avx512 ge(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+
+KFR_INTRINSIC f32avx512 band(const f32avx512& x, const f32avx512& y) { return _mm512_and_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 band(const f64avx512& x, const f64avx512& y) { return _mm512_and_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bor(const f32avx512& x, const f32avx512& y) { return _mm512_or_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bor(const f64avx512& x, const f64avx512& y) { return _mm512_or_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bxor(const f32avx512& x, const f32avx512& y) { return _mm512_xor_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bxor(const f64avx512& x, const f64avx512& y) { return _mm512_xor_pd(x.v, y.v); }
+
+// NOTE(review): `#if 1` always selects the plain integer complement over the
+// _knot_mask* intrinsics -- presumably a compiler-availability workaround;
+// confirm before switching the branch.
+#if 1
+#define KFR_knot_mask8(x) ((__mmask8)(~((u8)(x))))
+#define KFR_knot_mask16(x) ((__mmask16)(~((u16)(x))))
+#define KFR_knot_mask32(x) ((__mmask32)(~((u32)(x))))
+#define KFR_knot_mask64(x) ((__mmask64)(~((u64)(x))))
+#else
+#define KFR_knot_mask8(x) _knot_mask8(x)
+#define 
KFR_knot_mask16(x) _knot_mask16(x)
+#define KFR_knot_mask32(x) _knot_mask32(x)
+#define KFR_knot_mask64(x) _knot_mask64(x)
+#endif
+
+// Signed integer compares: cmp*_mask yields one bit per lane; movm expands the
+// bitmask into a full per-lane mask vector. ne/ge/le complement the eq/lt
+// mask; the KFR_knot_maskN width N equals the lane count for the element size
+// (64 lanes of i8, 32 of i16, 16 of i32, 8 of i64).
+KFR_INTRINSIC i8avx512 eq(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 eq(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 eq(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 eq(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 ne(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ne(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ne(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ne(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 ge(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ge(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ge(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ge(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 lt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 lt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 lt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 lt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 le(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i16avx512 le(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i32avx512 le(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i64avx512 le(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i8avx512 gt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(y.v, x.v));
+}
+KFR_INTRINSIC i16avx512 gt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(y.v, x.v));
+}
+KFR_INTRINSIC i32avx512 gt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(y.v, x.v));
+}
+KFR_INTRINSIC i64avx512 gt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(y.v, x.v));
+}
+
+// Unsigned compares use the native epu (unsigned) mask compares directly;
+// no sign-bit bias is needed on AVX-512.
+KFR_INTRINSIC u8avx512 eq(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 eq(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 eq(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 eq(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 ne(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ne(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ne(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ne(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 ge(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ge(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ge(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ge(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 lt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 lt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 lt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 lt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 le(const u8avx512& 
x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u16avx512 le(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u32avx512 le(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u64avx512 le(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u8avx512 gt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(y.v, x.v));
+}
+KFR_INTRINSIC u16avx512 gt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(y.v, x.v));
+}
+KFR_INTRINSIC u32avx512 gt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(y.v, x.v));
+}
+KFR_INTRINSIC u64avx512 gt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(y.v, x.v));
+}
+
+// Integer add/sub: two's-complement wrap-around is signedness-agnostic, so
+// signed and unsigned element types share the same epi intrinsics.
+KFR_INTRINSIC i8avx512 add(const i8avx512& x, const i8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 add(const i16avx512& x, const i16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 add(const i32avx512& x, const i32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 add(const i64avx512& x, const i64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 add(const u8avx512& x, const u8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 add(const u16avx512& x, const u16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 add(const u32avx512& x, const u32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 add(const u64avx512& x, const u64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 sub(const i8avx512& x, const i8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 sub(const i16avx512& x, const i16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 sub(const i32avx512& x, const i32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 sub(const i64avx512& x, const i64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 sub(const u8avx512& x, const u8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 sub(const u16avx512& x, const u16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 sub(const u32avx512& x, const u32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 sub(const u64avx512& x, const u64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+
+// There is no 8-bit SIMD multiply: multiply even/odd bytes inside 16-bit
+// lanes, then recombine the low byte of each 16-bit product.
+KFR_INTRINSIC __m512i mul_epi8(const __m512i& x, const __m512i& y)
+{
+    const __m512i even = _mm512_mullo_epi16(x, y);
+    const __m512i odd = _mm512_mullo_epi16(_mm512_srli_epi16(x, 8), _mm512_srli_epi16(y, 8));
+    return _mm512_or_si512(_mm512_slli_epi16(odd, 8), _mm512_srli_epi16(_mm512_slli_epi16(even, 8), 8));
+}
+
+// NOTE(review): _mm512_mullo_epi64 requires AVX-512DQ -- confirm the build
+// gates this path on DQ in addition to F/BW.
+KFR_INTRINSIC i8avx512 mul(const i8avx512& x, const i8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 mul(const i16avx512& x, const i16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 mul(const i32avx512& x, const i32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 mul(const i64avx512& x, const i64avx512& y) { return _mm512_mullo_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 mul(const u8avx512& x, const u8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 mul(const u16avx512& x, const u16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 mul(const u32avx512& x, const u32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 mul(const u64avx512& x, const u64avx512& y) { return _mm512_mullo_epi64(x.v, 
y.v); }
+
+// Integer division has no SIMD instruction; compute it lane-by-lane.
+// Division by zero yields 0 rather than faulting (explicit `y[i] ? ... : 0`
+// guard).
+// FIX: the signed overloads previously accumulated into the *unsigned* vector
+// types (u8avx512 etc.); the result container now matches the signed operand
+// type so the signed quotient's element type is preserved.
+KFR_INTRINSIC i8avx512 div(const i8avx512& x, const i8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i16avx512 div(const i16avx512& x, const i16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i32avx512 div(const i32avx512& x, const i32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i64avx512 div(const i64avx512& x, const i64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u8avx512 div(const u8avx512& x, const u8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u16avx512 div(const u16avx512& x, const u16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u32avx512 div(const u32avx512& x, const u32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u64avx512 div(const u64avx512& x, const u64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx512, result[i] = y[i] ? 
x[i] / y[i] : 0);
+}
+
+// Bitwise ops are element-size-agnostic; all integer widths share si512 forms.
+KFR_INTRINSIC i8avx512 band(const i8avx512& x, const i8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 band(const i16avx512& x, const i16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 band(const i32avx512& x, const i32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 band(const i64avx512& x, const i64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 band(const u8avx512& x, const u8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 band(const u16avx512& x, const u16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 band(const u32avx512& x, const u32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 band(const u64avx512& x, const u64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bor(const i8avx512& x, const i8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bor(const i16avx512& x, const i16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bor(const i32avx512& x, const i32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bor(const i64avx512& x, const i64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bor(const u8avx512& x, const u8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bor(const u16avx512& x, const u16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bor(const u32avx512& x, const u32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bor(const u64avx512& x, const u64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bxor(const i8avx512& x, const i8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bxor(const i16avx512& x, const i16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bxor(const i32avx512& x, const i32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bxor(const i64avx512& x, const i64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bxor(const u8avx512& x, const u8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bxor(const u16avx512& x, const u16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bxor(const u32avx512& x, const u32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bxor(const u64avx512& x, const u64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+
+// Shifts of float vectors operate on the raw bit patterns (logical shifts on
+// the reinterpreted integer lanes).
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(x.v), y));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_srli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(x.v), y));
+}
+
+// 16/32-bit shifts: shr is logical (srli) for unsigned, arithmetic (srai) for
+// signed element types.
+KFR_INTRINSIC u16avx512 shl(const u16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shl(const i16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC u16avx512 shr(const u16avx512& x, unsigned y) { return _mm512_srli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, unsigned y) { return _mm512_srli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shr(const i16avx512& x, unsigned y) { return _mm512_srai_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, unsigned y) { return _mm512_srai_epi32(x.v, y); }
+
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, unsigned y) { return _mm512_srli_epi64(x.v, y); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+// Scalar fallback for 64-bit arithmetic right shift.
+// FIX: the result container was u64avx512 for a signed result; it now matches
+// the operand type. NOTE(review): AVX-512F provides _mm512_srai_epi64, which
+// could replace this loop entirely -- confirm and simplify.
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx512, result[i] = x[i] >> y);
+}
+
+// 8-bit shifts: widen bytes into the high half of 16-bit lanes (unpack with
+// zeros), shift at 16-bit granularity, then narrow back.
+// NOTE(review): _mm512_packs_epi16 *saturates* signed 16-bit values to 8 bits
+// and does not extract the high byte of the shifted lane -- verify this
+// narrowing step against the SSE/AVX2 8-bit shift implementations.
+KFR_INTRINSIC u8avx512 shl(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shl(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC u8avx512 shr(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srli_epi16(l, y);
+    __m512i hh = _mm512_srli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shr(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srai_epi16(l, y);
+    __m512i hh = _mm512_srai_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, const u64avx512& y) { return 
_mm512_sllv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, const u64avx512& y) { return _mm512_sllv_epi64(x.v, y.v); }
+
+// Per-lane variable shifts; shr uses arithmetic (srav) forms for signed types.
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, const u32avx512& y) { return _mm512_srlv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, const u32avx512& y) { return _mm512_srav_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, const u64avx512& y) { return _mm512_srlv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, const u64avx512& y) { return _mm512_srav_epi64(x.v, y.v); }
+
+// Variable shifts of float vectors act on the raw bit patterns.
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_sllv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_srlv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_srlv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+
+#endif
+
+#endif
+
+#endif
+
+// Adapters that extend the fixed-width intrinsic overloads above to any vector
+// length N: narrow vectors are widened to the native SIMD width and sliced
+// back; over-wide vectors are split in half recursively via low()/high().
+#define KFR_HANDLE_ALL_SIZES_SHIFT_2(fn)                                                                     \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), b));                                                           \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return concat(fn(low(a), b), fn(high(a), b));                                                        \
+    }
+#define KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(fn)                                                                 \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), expand_simd(b)));                                              \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return concat(fn(low(a), low(b)), fn(high(a), high(b)));                                             \
+    }
+
+KFR_HANDLE_ALL_SIZES_2(add)
+KFR_HANDLE_ALL_SIZES_2(sub)
+KFR_HANDLE_ALL_SIZES_2(mul)
+KFR_HANDLE_ALL_SIZES_2(div)
+
+KFR_HANDLE_ALL_SIZES_2(eq)
+KFR_HANDLE_ALL_SIZES_2(ne)
+KFR_HANDLE_ALL_SIZES_2(lt)
+KFR_HANDLE_ALL_SIZES_2(gt)
+KFR_HANDLE_ALL_SIZES_2(le)
+KFR_HANDLE_ALL_SIZES_2(ge)
+
+KFR_HANDLE_ALL_SIZES_2(band)
+KFR_HANDLE_ALL_SIZES_2(bor)
+KFR_HANDLE_ALL_SIZES_2(bxor)
+
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shr)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shr)
+
+#else
+
+// Generic (no-SIMD) fallback: everything is computed lane-by-lane.
+// Shifts go through the unsigned integer bit pattern (uibitcast) so the
+// operation is well-defined for float element types too.
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y)));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y)));
+}
+
+// Fallback compares: internal::maskbits builds the all-ones/all-zeros lane
+// value from the scalar comparison result.
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i]));
+}
+
+// Fallback bitwise ops go through the unsigned bit pattern (ubitcast).
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>((ubitcast(x[i]) | ubitcast(y[i])))));
+}
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) ^ ubitcast(y[i]))));
+}
+template <typename T, size_t N, 
typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) & ubitcast(y[i])))); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] + y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] - y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] * y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] / y[i]); +} + +#define KFR_HANDLE_VEC_SCA(fn) \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y) \ + { \ + return fn(x, vec<T, N>(y)); \ + } \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y) \ + { \ + return fn(vec<T, N>(x), y); \ + } + +KFR_HANDLE_VEC_SCA(add) +KFR_HANDLE_VEC_SCA(sub) +KFR_HANDLE_VEC_SCA(mul) +KFR_HANDLE_VEC_SCA(div) +KFR_HANDLE_VEC_SCA(band) +KFR_HANDLE_VEC_SCA(bor) +KFR_HANDLE_VEC_SCA(bxor) +KFR_HANDLE_VEC_SCA(eq) +KFR_HANDLE_VEC_SCA(ne) +KFR_HANDLE_VEC_SCA(lt) +KFR_HANDLE_VEC_SCA(gt) +KFR_HANDLE_VEC_SCA(le) +KFR_HANDLE_VEC_SCA(ge) + +#endif + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x) +{ + return bxor(special_constants<T>::allones(), x); +} + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> 
+KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return sub(T(0), x); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return bxor(special_constants<T>::highbitmask(), x); +} + +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp @@ -0,0 +1,295 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "../../base/expression.hpp"
#include "../shuffle.hpp"
#include "../types.hpp"
#include "../vec.hpp"

CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

// Forwards a non-floating-point argument to the floating-point overload of fn
// by element-casting to the corresponding float type.
#define KFR_HANDLE_NOT_F_1(fn)                                                                               \
    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>                                    \
    KFR_INTRINSIC vec<flt_type<T>, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                    \
    {                                                                                                        \
        return intrinsics::fn(elemcast<flt_type<T>>(a));                                                     \
    }

// Scalar entry point: wraps scalar arguments into 1-element vectors, calls the
// vector intrinsic, and unwraps the result. Enabled only when no argument is a vec.
#define KFR_HANDLE_SCALAR(fn)                                                                                \
    template <typename T1, typename... Args, typename Tout = ::kfr::common_type<T1, Args...>,                \
              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

// As KFR_HANDLE_SCALAR, but with an explicitly supplied result type Tout.
#define KFR_HANDLE_SCALAR_1_T(fn, Tout)                                                                      \
    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

// Counterpart of KFR_HANDLE_SCALAR_1_T enabled when at least one argument IS a vec.
#define KFR_HANDLE_ARGS_T(fn, Tout)                                                                          \
    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
              KFR_ENABLE_IF(or_t<is_vec<T1>, is_vec<Args>...>::value)>                                       \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

namespace intrinsics
{
// Shorthand typedefs for full-register vectors of each element type,
// one set per instruction-set register width.
#ifdef CMT_ARCH_X86
using f32sse = vec<f32, 4>;
using f64sse = vec<f64, 2>;
using i8sse  = vec<i8, 16>;
using i16sse = vec<i16, 8>;
using i32sse = vec<i32, 4>;
using i64sse = vec<i64, 2>;
using u8sse  = vec<u8, 16>;
using u16sse = vec<u16, 8>;
using u32sse = vec<u32, 4>;
using u64sse = vec<u64, 2>;

using f32avx = vec<f32, 8>;
using f64avx = vec<f64, 4>;
using i8avx  = vec<i8, 32>;
using i16avx = vec<i16, 16>;
using i32avx = vec<i32, 8>;
using i64avx = vec<i64, 4>;
using u8avx  = vec<u8, 32>;
using u16avx = vec<u16, 16>;
using u32avx = vec<u32, 8>;
using u64avx = vec<u64, 4>;

using f32avx512 = vec<f32, 16>;
using f64avx512 = vec<f64, 8>;
using i8avx512  = vec<i8, 64>;
using i16avx512 = vec<i16, 32>;
using i32avx512 = vec<i32, 16>;
using i64avx512 = vec<i64, 8>;
using u8avx512  = vec<u8, 64>;
using u16avx512 = vec<u16, 32>;
using u32avx512 = vec<u32, 16>;
using u64avx512 = vec<u64, 8>;

#else
using f32neon = vec<f32, 4>;
using f64neon = vec<f64, 2>;
using i8neon  = vec<i8, 16>;
using i16neon = vec<i16, 8>;
using i32neon = vec<i32, 4>;
using i64neon = vec<i64, 2>;
using u8neon  = vec<u8, 16>;
using u16neon = vec<u16, 8>;
using u32neon = vec<u32, 4>;
using u64neon = vec<u64, 2>;
#endif

// Smallest SIMD-friendly width >= n for element type T (at least the minimum
// vector width, otherwise the next power of two).
template <typename T>
constexpr inline size_t next_simd_width(size_t n) CMT_NOEXCEPT
{
    return n < minimum_vector_width<T> ? minimum_vector_width<T> : next_poweroftwo(n);
}

// Widens a vector to the next SIMD width: a 1-element vector is broadcast,
// otherwise the vector is extended (tail contents unspecified) or, with an
// explicit fill value, widened.
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, 1>& x) CMT_NOEXCEPT
{
    return broadcast<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x) CMT_NOEXCEPT
{
    return extend<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) CMT_NOEXCEPT
{
    return widen<Nout>(x, value);
}

// intrin(): applies fn at native vector width; for wider vectors it recurses
// into the low/high halves until each half fits in a register.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    result = fn(a, b, c);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, c.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, c.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    result = fn(a);
}

template <typename T, size_t Nvec = vector_width<T>, size_t N, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    intrin(result.h.low, a.h.low, fn);
    intrin(result.h.high, a.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, fn);
}

// Mixed vector/scalar variants: the scalar operand is passed through unchanged
// to both halves.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b, fn);
    intrin(result.h.high, a.h.high, b, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a, b.h.low, fn);
    intrin(result.h.high, a, b.h.high, fn);
}

// Generates all-size dispatch for a unary intrinsic, guarded by an extra
// compile-time condition: sub-width vectors are widened/shuffled, wider ones
// recurse through intrin() above.
#define KFR_HANDLE_ALL_SIZES_1_IF(fn, cond)                                                                  \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value && cond)>   \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq<Nout>)).shuffle(csizeseq<N>);                               \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, [](const auto& x) { return intrinsics::fn(x); });                                       \
        return r;                                                                                            \
    }

#define KFR_HANDLE_ALL_SIZES_1(fn) KFR_HANDLE_ALL_SIZES_1_IF(fn, true)

// Binary version: covers vec/vec, vec/scalar and scalar/vec argument shapes.
#define KFR_HANDLE_ALL_SIZES_2(fn)                                                                           \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), b.shuffle(csizeseq_t<Nout>()))                  \
            .shuffle(csizeseq<N>);                                                                           \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), vec<T, Nout>(b)).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(vec<T, Nout>(a), b.shuffle(csizeseq_t<Nout>())).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }

// vec1<T>: T itself if it is already a vec, otherwise a 1-element vec<T, 1>.
template <typename T>
using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>;

// to_scalar: unwraps a 1-element vector to its scalar; passes anything else through.
template <typename T>
inline const T& to_scalar(const T& value) CMT_NOEXCEPT
{
    return value;
}
template <typename T>
inline T to_scalar(const vec<T, 1>& value) CMT_NOEXCEPT
{
    return value[0];
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/impl/intrinsics.h b/include/kfr/simd/impl/intrinsics.h
@@ -0,0 +1,50 @@
#pragma once

#include "../../cident.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

#ifdef CMT_ARCH_SSE2
#include <immintrin.h>
#ifdef CMT_OS_WIN
#include <intrin.h>
#endif
#endif

#ifdef CMT_ARCH_NEON
#include <arm_neon.h>
#endif

#if defined CMT_COMPILER_GCC && defined CMT_ARCH_X86
#include <x86intrin.h>
#endif

// addressof that ignores any overloaded operator&; clang has a builtin for it.
#ifdef CMT_COMPILER_CLANG
#define builtin_addressof(x) __builtin_addressof(x)
#else
template <class T>
inline T* builtin_addressof(T& arg)
{
    return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
}
#endif

// Thin wrappers over compiler builtins (GNU-compatible compilers) or the C
// library so the rest of KFR can call one spelling everywhere.
#ifdef CMT_COMPILER_GNU
CMT_INLINE float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
CMT_INLINE double builtin_sqrt(double x) { return __builtin_sqrt(x); }
CMT_INLINE long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size)
{
    __builtin_memcpy(dest, src, size);
}
CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); }
#else
CMT_INLINE float builtin_sqrt(float x) { return ::sqrtf(x); }
CMT_INLINE double builtin_sqrt(double x) { return ::sqrt(x); }
CMT_INLINE long double builtin_sqrt(long double x) { return ::sqrtl(x); }
CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); }
CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); }
#endif

#define KFR_ENABLE_IF CMT_ENABLE_IF
diff --git a/include/kfr/simd/impl/operators.hpp b/include/kfr/simd/impl/operators.hpp
@@ -0,0 +1,164 @@
/** @addtogroup basic_math
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "function.hpp"

#ifdef CMT_CLANG_EXT
#include "basicoperators_clang.hpp"
#else
#include "basicoperators_generic.hpp"
#endif

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// Operators on vectors of complex<T>. Bitwise and additive operations act on
// the flattened (interleaved re/im) representation; mul/div implement the
// complex formulas with even/odd lane shuffles.

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> neg(const vec<complex<T>, N>& x)
{
    return neg(x.flatten()).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> add(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return add(x.flatten(), y.flatten()).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> sub(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return sub(x.flatten(), y.flatten()).v;
}

// (a+bi)(c+di): even lanes get ac-bd, odd lanes get ad+bc, via subadd of
// the even-duplicated and odd-duplicated factors.
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> mul(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    const vec<T, (N * 2)> xx = x.v;
    const vec<T, (N * 2)> yy = y.v;
    return subadd(mul(xx, dupeven(yy)), mul(swap<2>(xx), dupodd(yy))).v;
}

// Complex division: multiply by the conjugate and divide by |y|^2 = c^2 + d^2.
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> div(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    const vec<T, (N * 2)> xx = x.v;
    const vec<T, (N * 2)> yy = y.v;
    const vec<T, (N * 2)> m  = (add(sqr(dupeven(yy)), sqr(dupodd(yy))));
    return swap<2>(subadd(mul(swap<2>(xx), dupeven(yy)), mul(xx, dupodd(yy))) / m).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> bor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return bor(x.flatten(), y.flatten()).v;
}
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> bxor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return bxor(x.flatten(), y.flatten()).v;
}
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> band(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return band(x.flatten(), y.flatten()).v;
}

// Broadcasts a single complex scalar operand to a full vector and reuses the
// vec/vec overload.
#define KFR_COMPLEX_OP_CVT(fn)                                                                               \
    template <typename T, size_t N>                                                                          \
    KFR_INTRINSIC vec<complex<T>, N> fn(const vec<complex<T>, N>& x, const complex<T>& y)                    \
    {                                                                                                        \
        return fn(x, vec<complex<T>, N>(y));                                                                 \
    }                                                                                                        \
    template <typename T, size_t N>                                                                          \
    KFR_INTRINSIC vec<complex<T>, N> fn(const complex<T>& x, const vec<complex<T>, N>& y)                    \
    {                                                                                                        \
        return fn(vec<complex<T>, N>(x), y);                                                                 \
    }

KFR_COMPLEX_OP_CVT(mul)
KFR_COMPLEX_OP_CVT(div)
KFR_COMPLEX_OP_CVT(band)
KFR_COMPLEX_OP_CVT(bxor)
KFR_COMPLEX_OP_CVT(bor)

// Operators on nested vectors (vec<vec<T, N1>, N2>): flatten, promote the two
// element types to their common type, apply the flat operation, rewrap.
#define KFR_VECVEC_OP1(fn)                                                                                   \
    template <typename T1, size_t N1, size_t N2>                                                             \
    KFR_INTRINSIC vec<vec<T1, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x)                                     \
    {                                                                                                        \
        return fn(x.flatten()).v;                                                                            \
    }

#define KFR_VECVEC_OP2(fn)                                                                                   \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<vec<T2, N1>, N2>& y)       \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), innercast<C>(y.flatten())).v;                                   \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const T2& y)                         \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), innercast<C>(y)).v;                                             \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<T2, N1>& y)                \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), repeat<N2>(innercast<C>(y.flatten()))).v;                       \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const T1& x, const vec<vec<T2, N1>, N2>& y)                         \
    {                                                                                                        \
        return fn(innercast<C>(x), innercast<C>(y.flatten())).v;                                             \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<T1, N1>& x, const vec<vec<T2, N1>, N2>& y)                \
    {                                                                                                        \
        return fn(repeat<N2>(innercast<C>(x.flatten())), innercast<C>(y.flatten())).v;                       \
    }

KFR_VECVEC_OP1(neg)
KFR_VECVEC_OP1(bnot)
KFR_VECVEC_OP2(add)
KFR_VECVEC_OP2(sub)
KFR_VECVEC_OP2(mul)
KFR_VECVEC_OP2(div)
KFR_VECVEC_OP2(band)
KFR_VECVEC_OP2(bor)
KFR_VECVEC_OP2(bxor)

} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp
@@ -0,0 +1,183 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "../platform.hpp"

namespace kfr
{

inline namespace CMT_ARCH_NAME
{

// Floating-point values whose bit patterns are all-ones / all-bits-except-sign.
// On GNU-compatible compilers these are built as NaN payloads so they stay
// constexpr; the generic fallback materialises them through SSE bit casts.
// NOTE(review): the CMT_COMPILER_MSVC branch also calls __builtin_nanf —
// presumably this path is only reached by clang-cl; verify plain MSVC never
// selects it.
#if defined CMT_COMPILER_GNU
constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
#elif defined CMT_COMPILER_MSVC
constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
#else
inline f32 allones_f32() CMT_NOEXCEPT
{
    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
}
inline f64 allones_f64() CMT_NOEXCEPT
{
    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
}
inline f32 invhighbit_f32() CMT_NOEXCEPT
{
    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
}
inline f64 invhighbit_f64() CMT_NOEXCEPT
{
    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
}
#endif

// Per-scalar-type bit-pattern constants used by mask and sign-manipulation code:
// sign bit only, all bits set, all bits clear, all bits except the sign bit.
template <typename T>
struct special_scalar_constants
{
    constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
    constexpr static T allones() { return static_cast<T>(-1ll); }
    constexpr static T allzeros() { return T(0); }
    constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
};

// The Intel compiler cannot evaluate the NaN-based constants at compile time.
#ifndef CMT_COMPILER_INTEL
#define KFR_CONSTEXPR_NON_INTEL constexpr
#else
#define KFR_CONSTEXPR_NON_INTEL
#endif

template <>
struct special_scalar_constants<float>
{
    constexpr static float highbitmask() { return -0.f; }
    KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
    constexpr static float allzeros() { return 0.f; }
    KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
};

template <>
struct special_scalar_constants<double>
{
    constexpr static double highbitmask() { return -0.; }
    KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
    constexpr static double allzeros() { return 0.; }
    KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
};

// For compound types (e.g. vec/complex) the constants of the underlying
// scalar subtype are exposed.
template <typename T>
struct special_constants : public special_scalar_constants<subtype<T>>
{
public:
    using Tsub = subtype<T>;
};

namespace intrinsics
{

// Tag types carrying element type and lane count(s) for overload selection.
template <typename T, size_t N>
struct simd_t
{
    using value_type = T;

    constexpr static size_t size() { return N; }
};

template <typename T, size_t N1, size_t N2>
struct simd2_t
{
    using value_type = T;

    constexpr static size_t size1() { return N1; }

    constexpr static size_t size2() { return N2; }
};

template <typename Tout, typename Tin, size_t N>
struct simd_cvt_t
{
    using value_type_out = Tout;
    using value_type_in  = Tin;

    constexpr static size_t size() { return N; }
};

// Alignment for an N-element vector of T: next power of two of its byte size,
// capped at the platform's native vector alignment.
template <typename T, size_t N>
constexpr size_t alignment()
{
    return const_min(size_t(platform<>::native_vector_alignment), next_poweroftwo(sizeof(T) * N));
}

// Plain aligned array storage; rounded up to a power-of-two element count.
template <typename T, size_t N>
struct alignas(alignment<T, N>()) simd_array
{
    T val[next_poweroftwo(N)];
};

template <typename T, size_t N>
struct simd_type;

template <typename T>
struct simd_type<T, 0>
{
    // SFINAE
};

// A vector as a pair of halves; the half size is the largest power of two
// strictly below N. Explicit ctors/assignment are only compiled in when
// KFR_DEFINE_CTORS_FOR_HALVES is set.
template <typename T, size_t N>
struct simd_halves
{
    using subtype = typename simd_type<T, prev_poweroftwo(N - 1)>::type;

    subtype low;
    subtype high;
#if KFR_DEFINE_CTORS_FOR_HALVES
    simd_halves() CMT_NOEXCEPT {}
    simd_halves(const subtype& l, const subtype& h) CMT_NOEXCEPT : low(l), high(h) {}
    simd_halves(const simd_halves& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}
    simd_halves(simd_halves&& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}

    simd_halves& operator=(const simd_halves& v) CMT_NOEXCEPT
    {
        low  = v.low;
        high = v.high;
        return *this;
    }
    simd_halves& operator=(simd_halves&& v) CMT_NOEXCEPT
    {
        low  = v.low;
        high = v.high;
        return *this;
    }
#endif
};

} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/impl/specializations.i b/include/kfr/simd/impl/specializations.i
@@ -0,0 +1,116 @@
/**
 * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
 * This file is part of KFR
 *
 * KFR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * KFR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with KFR.
 */
#pragma once

#include "../vec.hpp"
#ifndef KFR_SHUFFLE_SPECIALIZATIONS
#include "../shuffle.hpp"
#endif

// Hand-tuned shufflevector specializations for the permutations used by the
// FFT (digit-reversal and interleave patterns on f32 vectors).
// NOTE(review): this guard tests KFR_COMPILER_GNU while sibling headers test
// CMT_COMPILER_GNU — confirm KFR_COMPILER_GNU is actually defined somewhere,
// otherwise these specializations are never compiled.
#ifdef KFR_COMPILER_GNU

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{
// Radix-4 digit-reversal permutation of 32 floats, done as two half-register
// permutes followed by a cross-lane group permute.
template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14,
             15, 22, 23, 30, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    f32x32 w = x;

    w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)),
               permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w)));

    w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
    return w;
}

template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22,
             23, 14, 15, 30, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    f32x32 w = x;

    w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)),
               permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w)));

    w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
    return w;
}

// Helper reusing the specialization above for the 64-element case.
inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x)
{
    return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18,
                                         19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>,
                                  x, x);
}

template <>
inline vec<f32, 64> shufflevector<f32, 64>(
    csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13,
             44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38,
             39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>,
    const vec<f32, 64>& x, const vec<f32, 64>&)
{
    return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(
        concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x))));
}

// Deinterleave 16 floats: evens to the low half, odds to the high half.
template <>
inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>,
                                           const vec<f32, 16>& x, const vec<f32, 16>&)
{
    // asm volatile("int $3");
    const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x);

    return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)),
                  shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx)));
}

// Interleave the two halves of 16 floats.
template <>
inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>,
                                           const vec<f32, 16>& x, const vec<f32, 16>&)
{
    const vec<f32, 16> xx =
        concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));

    return permutegroups<(4), 0, 2, 1, 3>(xx);
}

// Interleave the two halves of 32 floats.
template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13,
             29, 14, 30, 15, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x);

    return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx)));
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
#endif
diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp
@@ -0,0 +1,155 @@
/** @addtogroup logical
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "vec.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T> +using maskfor = typename T::mask_t; + +namespace internal +{ + +template <typename T> +constexpr inline T maskbits(bool value) +{ + return value ? special_constants<T>::allones() : special_constants<T>::allzeros(); +} +} // namespace internal + +template <typename T, size_t N> +struct mask : protected vec<T, N> +{ + using base = vec<T, N>; + + KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default; + + KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default; + + KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default; + + using simd_type = typename base::simd_type; + + KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {} + + template <typename... Args> + KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args) + : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2), + internal::maskbits<T>(static_cast<bool>(args))...) 
+ { + } + + using vec<T, N>::v; + + KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT; + + KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {} + + template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> + KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) + { + } + + template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))> + KFR_MEM_INTRINSIC mask(const mask<U, N>& m) + : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec())))) + { + } + + KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT; + + KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); } +}; + +namespace internal +{ + +template <typename T, size_t Nout, size_t N1, size_t... indices> +constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>) +{ + return make_vector(maskbits<T>(indices < N1)...); +} + +template <typename T, size_t Nout, size_t N1> +constexpr vec<T, Nout> partial_mask() +{ + return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); +} +} // namespace internal + +template <typename T, size_t N> +KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT +{ + return ibitcast(base::operator[](index)) < 0; +} + +template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)> +constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... 
args) +{ + return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...); +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr + +namespace cometa +{ + +template <typename T, size_t N> +struct compound_type_traits<kfr::mask<T, N>> +{ + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::mask<U, N>; + template <typename U> + using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; + + KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) + { + return value[index]; + } +}; +} // namespace cometa + +namespace std +{ +template <typename T1, typename T2, size_t N> +struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> +{ + using type = kfr::mask<typename common_type<T1, T2>::type, N>; +}; +} // namespace std diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp @@ -0,0 +1,810 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/operators.hpp" +#include "mask.hpp" +#include <algorithm> +#include <utility> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +#define KFR_VEC_OPERATOR1(op, fn) \ + template <typename T, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x) \ + { \ + return intrinsics::fn(x); \ + } + +#define KFR_VEC_OPERATOR2(op, asgnop, fn) \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + x = intrinsics::fn(x, elemcast<T1>(y)); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const T2& y) \ + { \ + x = intrinsics::fn(x, T1(y)); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const T2& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), C(y)); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const T1& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(C(x), elemcast<C>(y)); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)); \ + } + +#define KFR_VEC_SHIFT_OPERATOR(op, asgnop, fn) \ + template <typename T1, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, unsigned y) \ + { \ + x = 
intrinsics::fn(x, y); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + x = intrinsics::fn(x, elemcast<utype<T1>>(y)); \ + return x; \ + } \ + template <typename T, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, unsigned y) \ + { \ + return intrinsics::fn(x, y); \ + } \ + template <typename T, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const T& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(innercast<T>(x), elemcast<utype<T>>(y)); \ + } \ + template <typename T, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(x, elemcast<utype<T>>(y)); \ + } + +#define KFR_VEC_CMP_OPERATOR(op, fn) \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const T2& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), vec<C, N>(y)).asmask(); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const T1& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(vec<C, N>(x), elemcast<C>(y)).asmask(); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)).asmask(); \ + } + +KFR_VEC_OPERATOR1(-, neg) +KFR_VEC_OPERATOR1(~, bnot) + +KFR_VEC_OPERATOR2(+, +=, add) +KFR_VEC_OPERATOR2(-, -=, sub) +KFR_VEC_OPERATOR2(*, *=, mul) +KFR_VEC_OPERATOR2(/, /=, div) + +KFR_VEC_OPERATOR2(&, &=, band) +KFR_VEC_OPERATOR2(|, |=, bor) +KFR_VEC_OPERATOR2 (^, ^=, bxor) +KFR_VEC_SHIFT_OPERATOR(<<, <<=, shl) +KFR_VEC_SHIFT_OPERATOR(>>, >>=, shr) + 
+KFR_VEC_CMP_OPERATOR(==, eq) +KFR_VEC_CMP_OPERATOR(!=, ne) +KFR_VEC_CMP_OPERATOR(>=, ge) +KFR_VEC_CMP_OPERATOR(<=, le) +KFR_VEC_CMP_OPERATOR(>, gt) +KFR_VEC_CMP_OPERATOR(<, lt) + +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator&(const mask<T1, N>& x, const mask<T2, N>& y)CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator|(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator&&(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator||(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator^(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) ^ bitcast<C>(vec<T2, N>(y.v))).v); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> operator~(const mask<T, N>& x) CMT_NOEXCEPT +{ + return ~x.asvec(); +} +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> operator!(const mask<T, N>& x) CMT_NOEXCEPT +{ + return ~x.asvec(); +} + +KFR_INTRINSIC 
float bitwisenot(float x) { return fbitcast(~ubitcast(x)); } +KFR_INTRINSIC float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); } +KFR_INTRINSIC float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); } +KFR_INTRINSIC float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } +KFR_INTRINSIC float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } +KFR_INTRINSIC double bitwisenot(double x) { return fbitcast(~ubitcast(x)); } +KFR_INTRINSIC double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); } +KFR_INTRINSIC double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); } +KFR_INTRINSIC double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } +KFR_INTRINSIC double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } + +/// @brief Bitwise Not +template <typename T1> +KFR_INTRINSIC T1 bitwisenot(const T1& x) +{ + return ~x; +} +KFR_FN(bitwisenot) + +/// @brief Bitwise And +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseand(const T1& x, const T2& y) +{ + return x & y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwiseand(initialvalue<T>) +{ + return constants<T>::allones(); +} +KFR_FN(bitwiseand) + +/// @brief Bitwise And-Not +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y) +{ + return x & ~y; +} +template <typename T> +constexpr inline T bitwiseandnot(initialvalue<T>) +{ + return constants<T>::allones(); +} +KFR_FN(bitwiseandnot) + +/// @brief Bitwise Or +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseor(const T1& x, const T2& y) +{ + return x | y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwiseor(initialvalue<T>) +{ + return subtype<T>(0); +} +KFR_FN(bitwiseor) + +/// @brief Bitwise Xor (Exclusive Or) 
+template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwisexor(const T1& x, const T2& y) +{ + return x ^ y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwisexor(initialvalue<T>) +{ + return subtype<T>(); +} +KFR_FN(bitwisexor) + +/// @brief Bitwise Left shift +template <typename T1, typename T2> +KFR_INTRINSIC T1 shl(const T1& left, const T2& right) +{ + return left << right; +} +KFR_FN(shl) + +/// @brief Bitwise Right shift +template <typename T1, typename T2> +KFR_INTRINSIC T1 shr(const T1& left, const T2& right) +{ + return left >> right; +} +KFR_FN(shr) + +/// @brief Bitwise Left Rotate +template <typename T1, typename T2> +KFR_INTRINSIC T1 rol(const T1& left, const T2& right) +{ + return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); +} +KFR_FN(rol) + +/// @brief Bitwise Right Rotate +template <typename T1, typename T2> +KFR_INTRINSIC T1 ror(const T1& left, const T2& right) +{ + return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); +} +KFR_FN(ror) + +template <typename T> +constexpr KFR_INTRINSIC T add(const T& x) +{ + return x; +} + +/** + * @brief Returns sum of all the arguments passed to a function. + */ +template <typename T1, typename T2, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest) +{ + return x + add(y, rest...); +} +template <typename T> +constexpr KFR_INTRINSIC T add(initialvalue<T>) +{ + return T(0); +} +KFR_FN(add) + +/** + * @brief Returns template expression that returns sum of all the arguments passed to a function. + */ +template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)> +KFR_INTRINSIC internal::expression_function<fn::add, E...> add(E&&... x) +{ + return { fn::add(), std::forward<E>(x)... 
}; +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> sub(const T1& x, const T2& y) +{ + return x - y; +} +template <typename T> +constexpr KFR_INTRINSIC T sub(initialvalue<T>) +{ + return T(0); +} +KFR_FN(sub) + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y) +{ + return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +template <typename T1> +constexpr KFR_INTRINSIC T1 mul(const T1& x) +{ + return x; +} + +/** + * @brief Returns product of all the arguments passed to a function. + */ +template <typename T1, typename T2, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest) +{ + return x * mul(y, rest...); +} + +template <typename T> +constexpr KFR_INTRINSIC T mul(initialvalue<T>) +{ + return T(1); +} +KFR_FN(mul) + +/** + * @brief Returns template expression that returns product of all the arguments passed to a function. + */ +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::mul, E...> mul(E&&... x) +{ + return { fn::mul(), std::forward<E>(x)... }; +} + +/** + * @brief Returns square of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +constexpr inline T1 sqr(const T1& x) +{ + return x * x; +} +KFR_FN(sqr) + +/** + * @brief Returns template expression that returns square of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::sqr, E1> sqr(E1&& x) +{ + return { fn::sqr(), std::forward<E1>(x) }; +} + +/** + * @brief Returns cube of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +constexpr inline T1 cub(const T1& x) +{ + return sqr(x) * x; +} +KFR_FN(cub) + +/** + * @brief Returns template expression that returns cube of x. 
+ */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::cub, E1> cub(E1&& x) +{ + return { fn::cub(), std::forward<E1>(x) }; +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow2(const T& x) +{ + return sqr(x); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow3(const T& x) +{ + return cub(x); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow4(const T& x) +{ + return sqr(sqr(x)); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow5(const T& x) +{ + return pow4(x) * x; +} +KFR_FN(pow2) +KFR_FN(pow3) +KFR_FN(pow4) +KFR_FN(pow5) + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow2, E1> pow2(E1&& x) +{ + return { fn::pow2(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow3, E1> pow3(E1&& x) +{ + return { fn::pow3(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow4, E1> pow4(E1&& x) +{ + return { fn::pow4(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow5, E1> pow5(E1&& x) +{ + return { fn::pow5(), std::forward<E1>(x) }; +} + +/// Raise x to the power base \f$ x^{base} \f$ +/// @code +/// CHECK( ipow( 10, 3 ) == 1000 ); +/// CHECK( ipow( 0.5, 2 ) == 0.25 ); +/// @endcode +template <typename T> +constexpr inline T ipow(const T& x, int base) +{ + T xx = x; + T result = T(1); + while (base) + { + if (base & 1) + result *= xx; + base >>= 1; + xx *= xx; + } + return result; +} +KFR_FN(ipow) + +template <typename E1, typename E2, 
KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b) +{ + return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) }; +} + +/// Return square of the sum of all arguments +/// @code +/// CHECK(sqrsum(1,2,3) == 36); +/// @endcode +template <typename T1, typename... Ts> +constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... rest) +{ + return sqr(add(x, rest...)); +} + +template <typename T1, typename T2> +constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y) +{ + return sqr(x - y); +} +KFR_FN(sqrsum) +KFR_FN(sqrdiff) + +/// Division +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout div(const T1& x, const T2& y) +{ + return static_cast<Tout>(x) / static_cast<Tout>(y); +} +KFR_FN(div) + +/// Remainder +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout rem(const T1& x, const T2& y) +{ + return static_cast<Tout>(x) % static_cast<Tout>(y); +} +KFR_FN(rem) + +/// Negation +template <typename T1> +inline T1 neg(const T1& x) +{ + return -x; +} +KFR_FN(neg) + +/// @brief Fused Multiply-Add +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z) +{ + return x * y + z; +} +/// @brief Fused Multiply-Sub +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z) +{ + return x * y - z; +} +KFR_FN(fmadd) +KFR_FN(fmsub) + +/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1) +/// Returns `x + ( y - x ) * c` +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y) +{ + return fmadd(c, y - x, x); +} + +/// @brief Linear blend of `x` and `y` (`c` must be in 
the range -1...+1) +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y) +{ + return mix(fmadd(c, 0.5, 0.5), x, y); +} +KFR_FN(mix) +KFR_FN(mixs) + +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::mix, E1, E2, E3> mix(E1&& c, E2&& x, E3&& y) +{ + return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; +} + +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y) +{ + return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; +} + +namespace intrinsics +{ + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner(const T1&, const T2& c0) +{ + return c0; +} + +template <typename T1, typename T2, typename T3, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1, + const Ts&... values) +{ + return fmadd(horner(x, c1, values...), x, c0); +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner_even(const T1&, const T2& c0) +{ + return c0; +} + +template <typename T1, typename T2, typename T3, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2, + const Ts&... values) +{ + const T1 x2 = x * x; + return fmadd(horner(x2, c2, values...), x2, c0); +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner_odd(const T1& x, const T2& c1) +{ + return c1 * x; +} + +template <typename T1, typename T2, typename T3, typename... 
Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3, + const Ts&... values) +{ + const T1 x2 = x * x; + return fmadd(horner(x2, c3, values...), x2, c1) * x; +} +} // namespace intrinsics + +/// @brief Calculate polynomial using Horner's method +/// +/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner(const T1& x, const Ts&... c) +{ + return intrinsics::horner(x, c...); +} +KFR_FN(horner) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner, E...> horner(E&&... x) +{ + return { fn::horner(), std::forward<E>(x)... }; +} + +/// @brief Calculate polynomial using Horner's method (even powers) +/// +/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c) +{ + return intrinsics::horner_even(x, c...); +} +KFR_FN(horner_even) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner_even, E...> horner_even(E&&... x) +{ + return { fn::horner_even(), std::forward<E>(x)... }; +} + +/// @brief Calculate polynomial using Horner's method (odd powers) +/// +/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c) +{ + return intrinsics::horner_odd(x, c...); +} +KFR_FN(horner_odd) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... 
x) +{ + return { fn::horner_odd(), std::forward<E>(x)... }; +} + +/// @brief Calculate Multiplicative Inverse of `x` +/// Returns `1/x` +template <typename T> +constexpr KFR_INTRINSIC T reciprocal(const T& x) +{ + static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type"); + return subtype<T>(1) / x; +} +KFR_FN(reciprocal) + +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> mulsign(const T1& x, const T2& y) +{ + return bitwisexor(x, bitwiseand(y, special_constants<T2>::highbitmask())); +} +KFR_FN(mulsign) + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) +{ + return (x & special_constants<T>::highbitmask()) | (y & special_constants<T>::highbitmask()); +} + +/// @brief Swap byte order +template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> +KFR_INTRINSIC vec<T, N> swapbyteorder(const vec<T, N>& x) +{ + return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x))); +} +KFR_FN(swapbyteorder) + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return blend<1, 0>(a + b, a - b); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return blend<0, 1>(a + b, a - b); +} +KFR_FN(subadd) +KFR_FN(addsub) + +template <typename 
T, size_t N> +KFR_INTRINSIC vec<T, N> negeven(const vec<T, N>& x) +{ + return x ^ broadcast<N>(-T(), T()); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> negodd(const vec<T, N>& x) +{ + return x ^ broadcast<N>(T(), -T()); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::neg, E1> operator-(E1&& e1) +{ + return { fn::neg(), std::forward<E1>(e1) }; +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1) +{ + return { fn::bitwisenot(), std::forward<E1>(e1) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2) +{ + return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2) +{ + return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2) +{ + return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2) +{ + return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2) +{ + return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, 
KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2) +{ + return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2) +{ + return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2) +{ + return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2) +{ + return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename T, size_t N1, size_t... Ns> +vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest) +{ + const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...)); + return t.v; +} + +KFR_FN(packtranspose) + +template <typename T, size_t N> +KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT +{ + this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v; +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp @@ -0,0 +1,286 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "types.hpp" + +namespace kfr +{ + +/// @brief An enumeration representing cpu instruction set +enum class cpu_t : int +{ + generic = 0, +#ifdef CMT_ARCH_X86 + sse2 = 1, + sse3 = 2, + ssse3 = 3, + sse41 = 4, + sse42 = 5, + avx1 = 6, + avx2 = 7, + avx512 = 8, // F, CD, VL, DQ and BW + avx = static_cast<int>(avx1), + lowest = static_cast<int>(sse2), + highest = static_cast<int>(avx512), +#endif +#ifdef CMT_ARCH_ARM + neon = 1, + neon64 = 2, + lowest = static_cast<int>(neon), + highest = static_cast<int>(neon64), +#endif + native = static_cast<int>(CMT_ARCH_NAME), + +#ifdef CMT_ARCH_AVX +#define KFR_HAS_SECONDARY_PLATFORM + secondary = static_cast<int>(sse42), +#else + secondary = static_cast<int>(native), +#endif + + common = generic, // For compatibility + runtime = -1, +}; + +#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native + +template <cpu_t cpu> +using ccpu_t = cval_t<cpu_t, cpu>; + +template <cpu_t cpu> +constexpr ccpu_t<cpu> ccpu{}; + +namespace internal_generic +{ +constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } +constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } + +#ifdef CMT_ARCH_X86 +constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, + cpu_t::sse3, cpu_t::sse2>(); +#else +constexpr auto cpu_list = 
cvals<cpu_t, cpu_t::neon>; +#endif +} // namespace internal_generic + +template <cpu_t cpu> +using cpuval_t = cval_t<cpu_t, cpu>; +template <cpu_t cpu> +constexpr auto cpuval = cpuval_t<cpu>{}; + +constexpr auto cpu_all = + cfilter(internal_generic::cpu_list, internal_generic::cpu_list >= cpuval_t<cpu_t::native>()); + +/// @brief Returns name of the cpu instruction set +CMT_UNUSED static const char* cpu_name(cpu_t set) +{ +#ifdef CMT_ARCH_X86 + static const char* names[] = { "generic", "sse2", "sse3", "ssse3", "sse41", + "sse42", "avx", "avx2", "avx512" }; +#endif +#ifdef CMT_ARCH_ARM + static const char* names[] = { "generic", "neon", "neon64" }; +#endif + if (set >= cpu_t::lowest && set <= cpu_t::highest) + return names[static_cast<size_t>(set)]; + return "-"; +} + +#ifdef CMT_ARCH_X64 +template <int = 0> +constexpr inline const char* bitness_const(const char*, const char* x64) +{ + return x64; +} +template <typename T> +constexpr inline const T& bitness_const(const T&, const T& x64) +{ + return x64; +} +#else +template <int = 0> +constexpr inline const char* bitness_const(const char* x32, const char*) +{ + return x32; +} +template <typename T> +constexpr inline const T& bitness_const(const T& x32, const T&) +{ + return x32; +} +#endif + +template <cpu_t c = cpu_t::native> +struct platform; + +#ifdef CMT_ARCH_X86 +template <> +struct platform<cpu_t::common> +{ + constexpr static size_t native_cache_alignment = 64; + constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; + constexpr static size_t maximum_vector_alignment = 64; + constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; + + constexpr static size_t simd_register_count = 1; + + constexpr static size_t common_float_vector_size = 16; + constexpr static size_t common_int_vector_size = 16; + + constexpr static size_t minimum_float_vector_size = 16; + constexpr static size_t minimum_int_vector_size = 16; + + constexpr static size_t 
native_float_vector_size = 16; + constexpr static size_t native_int_vector_size = 16; + + constexpr static size_t native_vector_alignment = 16; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = false; +}; +template <> +struct platform<cpu_t::sse2> : platform<cpu_t::common> +{ + constexpr static size_t simd_register_count = bitness_const(8, 16); +}; +template <> +struct platform<cpu_t::sse3> : platform<cpu_t::sse2> +{ +}; +template <> +struct platform<cpu_t::ssse3> : platform<cpu_t::sse3> +{ +}; +template <> +struct platform<cpu_t::sse41> : platform<cpu_t::ssse3> +{ +}; +template <> +struct platform<cpu_t::sse42> : platform<cpu_t::sse41> +{ +}; +template <> +struct platform<cpu_t::avx> : platform<cpu_t::sse42> +{ + constexpr static size_t native_float_vector_size = 32; + + constexpr static size_t native_vector_alignment = 32; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = true; +}; +template <> +struct platform<cpu_t::avx2> : platform<cpu_t::avx> +{ + constexpr static size_t native_int_vector_size = 32; +}; +template <> +struct platform<cpu_t::avx512> : platform<cpu_t::avx2> +{ + constexpr static size_t native_float_vector_size = 64; + constexpr static size_t native_int_vector_size = 64; + + constexpr static size_t native_vector_alignment = 64; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static size_t simd_register_count = bitness_const(8, 32); +}; +#endif +#ifdef CMT_ARCH_ARM +template <> +struct platform<cpu_t::common> +{ + constexpr static size_t native_cache_alignment = 64; + constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; + constexpr static size_t maximum_vector_alignment = 16; + constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; + + constexpr static size_t simd_register_count = 
1; + + constexpr static size_t common_float_vector_size = 16; + constexpr static size_t common_int_vector_size = 16; + + constexpr static size_t minimum_float_vector_size = 16; + constexpr static size_t minimum_int_vector_size = 16; + + constexpr static size_t native_float_vector_size = 16; + constexpr static size_t native_int_vector_size = 16; + + constexpr static size_t native_vector_alignment = 16; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = false; +}; +template <> +struct platform<cpu_t::neon> : platform<cpu_t::common> +{ +}; +template <> +struct platform<cpu_t::neon64> : platform<cpu_t::neon> +{ +}; +#endif + +inline namespace CMT_ARCH_NAME +{ + +/// @brief SIMD vector width for the given cpu instruction set +template <typename T> +constexpr static size_t vector_width = + (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::native_float_vector_size / sizeof(T) + : platform<>::native_int_vector_size / sizeof(T))); + +template <typename T> +constexpr static size_t minimum_vector_width = + (const_max(size_t(1), typeclass<T> == datatype::f ? 
platform<>::minimum_float_vector_size / sizeof(T) + : platform<>::minimum_int_vector_size / sizeof(T))); + +template <typename T> +constexpr static size_t vector_capacity = platform<>::simd_register_count* vector_width<T>; + +#ifdef CMT_COMPILER_MSVC +template <typename T> +constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_width<T> * 2); +#else +template <typename T> +constexpr static size_t maximum_vector_size = const_min( + static_cast<size_t>(32), const_max(size_t(1), platform<>::simd_register_count / 4) * vector_width<T>); +#endif + +template <typename T> +constexpr static bool is_simd_size(size_t size) +{ + return is_poweroftwo(size) && size >= minimum_vector_width<T> && size <= vector_width<T>; +} + +template <typename T, size_t N = vector_width<T>> +struct vec; +template <typename T, size_t N = vector_width<T>> +struct mask; + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/read_write.hpp b/include/kfr/simd/read_write.hpp @@ -0,0 +1,243 @@ +/** @addtogroup read_write + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. 
+ See https://www.kfrlib.com for details. + */ +#pragma once + +#include "shuffle.hpp" +#include "types.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC static vec<T, N> read(const T* src) +{ + return vec<T, N>(src, cbool_t<A>()); +} + +template <bool A = false, size_t N, typename T> +KFR_INTRINSIC static void write(T* dest, const vec<T, N>& value) +{ + value.write(dest, cbool_t<A>()); +} + +template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> +KFR_INTRINSIC vec<T, Nout> gather(const T* base, size_t index, Indices... indices) +{ + return make_vector(base[index], base[indices]...); +} + +template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> +KFR_INTRINSIC vec<T, Nout> gather(const T* base) +{ + return make_vector(base[Index], base[Indices]...); +} + +template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> +KFR_INTRINSIC void scatter(const T* base, const vec<T, N>& value) +{ + base[Index] = value[InIndex]; + scatter<Indices..., T, N, InIndex + 1>(base, value); +} + +namespace internal +{ +template <typename T, size_t N, size_t... Indices> +KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) +{ + return make_vector(base[indices[Indices]]...); +} +template <size_t Nout, size_t Stride, typename T, size_t... Indices> +KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) +{ + return make_vector(base[Indices * Stride]...); +} +template <size_t Nout, size_t groupsize, typename T, size_t... 
Indices> +KFR_INTRINSIC vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) +{ + return make_vector(read<groupsize>(base + Indices * groupsize * stride)...); +} +} // namespace internal + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices) +{ + return internal::gather(base, indices, csizeseq<N>); +} + +template <size_t Nout, size_t groupsize = 1, typename T> +KFR_INTRINSIC vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride) +{ + return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq<Nout>); +} + +template <size_t Nout, size_t Stride, typename T> +KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base) +{ + return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>); +} + +template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> +KFR_INTRINSIC vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, + csizes_t<Indices...>) +{ + return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); +} +template <size_t groupsize = 1, typename T, size_t N, typename IT> +KFR_INTRINSIC vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) +{ + return gather_helper<groupsize>(base, offset, csizeseq<N>); +} + +template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> +KFR_INTRINSIC void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, + csizes_t<Indices...>) +{ + swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), + 0)... }; +} +template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... Indices> +KFR_INTRINSIC void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>) +{ + swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... 
}; +} +template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> +KFR_INTRINSIC void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) +{ + return scatter_helper<groupsize>(base, offset, value, csizeseq<N>); +} + +template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> +KFR_INTRINSIC void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride) +{ + return scatter_helper_s<groupsize>(base, stride, value, csizeseq<N>); +} + +template <typename T, size_t groupsize = 1> +struct stride_pointer : public stride_pointer<const T, groupsize> +{ + template <size_t N> + void write(const vec<T, N>& val, csize_t<N> = csize_t<N>()) + { + kfr::scatter_stride<N, groupsize>(this->ptr, val); + } +}; + +template <typename T, size_t groupsize> +struct stride_pointer<const T, groupsize> +{ + const T* ptr; + const size_t stride; + + template <size_t N> + vec<T, N> read(csize_t<N> = csize_t<N>()) + { + return kfr::gather_stride<N, groupsize>(ptr, stride); + } +}; + +template <typename T> +constexpr T partial_masks[] = { constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + T(), + T(), + 
T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T() }; + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> partial_mask(size_t index) +{ + static_assert(N <= arraysize(partial_masks<T>) / 2, + "N must not be greater than half of partial_masks array"); + return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> partial_mask(size_t index, vec_shape<T, N>) +{ + return partial_mask<T, N>(index); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/shuffle.hpp b/include/kfr/simd/shuffle.hpp @@ -0,0 +1,569 @@ +/** @addtogroup shuffle + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once +#include "constants.hpp" +#include "mask.hpp" +#include "types.hpp" +#include "vec.hpp" + +#include <tuple> +#include <utility> + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec<T, Nout> low(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout>); +} + +template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec_shape<T, Nout> low(vec_shape<T, N>) +{ + return {}; +} + +template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec<T, Nout> high(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout, prev_poweroftwo(N - 1)>); +} + +template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec_shape<T, Nout> high(vec_shape<T, N>) +{ + return {}; +} + +template <typename T, size_t... Ns> +KFR_INTRINSIC vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) CMT_NOEXCEPT +{ + return vec<T, csum<size_t, Ns...>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, Ns>::scalar_size()...>(vs.v...)); +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, N1 + N2> concat2(const vec<T, N1>& x, const vec<T, N2>& y) CMT_NOEXCEPT +{ + return vec<T, csum<size_t, N1, N2>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N1>::scalar_size(), + vec<T, N2>::scalar_size()>(x.v, y.v)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N * 4> concat4(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, + const vec<T, N>& d) CMT_NOEXCEPT +{ + return intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N * 2>::scalar_size(), + vec<T, N * 2>::scalar_size()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(a.v, b.v), + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(c.v, 
d.v)); +} + +template <size_t count, typename T, size_t N, size_t Nout = N* count> +KFR_INTRINSIC vec<T, Nout> repeat(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout> % csize<N>); +} + +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)> +KFR_INTRINSIC vec<T, Nout> resize(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout> % csize<N>); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)> +constexpr KFR_INTRINSIC const vec<T, Nout>& resize(const vec<T, N>& x) +{ + return x; +} + +namespace intrinsics +{ + +template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values) +{ + const std::tuple<Ts...> tup(values...); + return vec<T, Nout>(std::get<indices % Nin>(tup)...); +} +} // namespace intrinsics + +template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type> +KFR_INTRINSIC vec<C, Nout> broadcast(const Ts&... 
values) +{ + return intrinsics::broadcast_helper<C>(csizeseq<Nout>, values...); +} +KFR_FN(broadcast) + +template <size_t Ncount, typename T, size_t N> +KFR_INTRINSIC vec<T, N + Ncount> padhigh(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N + Ncount>); +} +KFR_FN(padhigh) + +template <size_t Ncount, typename T, size_t N> +KFR_INTRINSIC vec<T, N + Ncount> padlow(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N + Ncount, 0 - Ncount>); +} +KFR_FN(padlow) + +template <size_t Nout, typename T> +KFR_INTRINSIC vec<T, Nout> extend(const vec<T, 1>& x) +{ + return vec<T, Nout>(x.front()); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)> +KFR_INTRINSIC vec<T, Nout> extend(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout>); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)> +constexpr KFR_INTRINSIC const vec<T, Nout>& extend(const vec<T, N>& x) +{ + return x; +} +KFR_FN(extend) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<count, start>); +} +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<count, start>); +} +KFR_FN(slice) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<N> + (csizeseq<N> >= csize<start> && csizeseq<N> < csize<start + count>)*N); +} +KFR_FN(replace) + +template <size_t, typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>&) +{ +} +template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... 
args) +{ + out = x.shuffle(csizeseq<Nout, start>); + split<start + Nout>(x, std::forward<Args>(args)...); +} +template <typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high) +{ + low = x.shuffle(csizeseq<N / 2, 0>); + high = x.shuffle(csizeseq<N / 2, N / 2>); +} +template <typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2, + vec<T, N / 4>& w3) +{ + w0 = x.shuffle(csizeseq<N / 4, 0>); + w1 = x.shuffle(csizeseq<N / 4, N / 4>); + w2 = x.shuffle(csizeseq<N / 4, 2 * N / 4>); + w3 = x.shuffle(csizeseq<N / 4, 3 * N / 4>); +} +KFR_FN(split) + +template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total> +KFR_INTRINSIC vec<T, Nout> part(const vec<T, N>& x) +{ + static_assert(N % total == 0, "N % total == 0"); + return x.shuffle(csizeseq<Nout, number * Nout>); +} +KFR_FN(part) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<count, start>); +} + +template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) +{ + return x.shuffle(y.shuffle(csizeseq<N1>), csizeseq<N1 * 2>).shuffle(csizeseq<count, start>); +} + +template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) +{ + return x.shuffle(csizeseq<N2, -(N2 - N1)>) + .shuffle(y, csizeseq<N2 * 2>) + .shuffle(csizeseq<count, N2 - N1 + start>); +} + +KFR_FN(concat_and_slice) + +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)> +KFR_INTRINSIC vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) +{ + static_assert(Nout > N, "Nout > N"); + return concat(x, broadcast<Nout - 
N>(newvalue)); +} +template <size_t Nout, typename T, typename TS> +constexpr KFR_INTRINSIC const vec<T, Nout>& widen(const vec<T, Nout>& x, TS) +{ + return x; +} +KFR_FN(widen) + +template <size_t Nout, typename T, size_t N> +KFR_INTRINSIC vec<T, Nout> narrow(const vec<T, N>& x) +{ + static_assert(Nout <= N, "Nout <= N"); + return slice<0, Nout>(x); +} +KFR_FN(narrow) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> +KFR_INTRINSIC vec<T, Nout> even(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq<Nout / group, 0, 2>)); +} +KFR_FN(even) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> +KFR_INTRINSIC vec<T, Nout> odd(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq<Nout / group, 1, 2>)); +} +KFR_FN(odd) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> dupeven(const vec<T, N>& x) +{ + static_assert(N % 2 == 0, "N must be even"); + return x.shuffle(csizeseq<N, 0, 1> & ~csize<1>); +} +KFR_FN(dupeven) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> dupodd(const vec<T, N>& x) +{ + static_assert(N % 2 == 0, "N must be even"); + return x.shuffle(csizeseq<N, 0, 1> | csize<1>); +} +KFR_FN(dupodd) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N * 2> duphalfs(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N * 2> % csize<N>); +} +KFR_FN(duphalfs) + +template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); +} +KFR_FN(shuffle) + +template <size_t group, size_t... 
Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); +} +KFR_FN(shufflegroups) + +template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] + + csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); +} +KFR_FN(permute) + +template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); +} +KFR_FN(permutegroups) + +namespace internal +{ + +template <typename T, size_t Nout, typename Fn, size_t... 
Indices> +constexpr KFR_INTRINSIC vec<T, Nout> generate_vector(csizes_t<Indices...>) +{ + return make_vector(static_cast<T>(Fn()(Indices))...); +} +} // namespace internal + +template <typename T, size_t Nout, typename Fn> +constexpr KFR_INTRINSIC vec<T, Nout> generate_vector() +{ + return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>()); +} +KFR_FN(generate_vector) + +namespace internal +{ +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> evenmask() +{ + return broadcast<N>(maskbits<T>(true), maskbits<T>(false)); +} +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> oddmask() +{ + return broadcast<N>(maskbits<T>(false), maskbits<T>(true)); +} +} // namespace internal + +template <typename T, size_t N, size_t Nout = N * 2> +KFR_INTRINSIC vec<T, Nout> dup(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<Nout>() / csize_t<2>()); +} +KFR_FN(dup) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> duplow(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>()); +} +KFR_FN(duplow) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> duphigh(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>()); +} +KFR_FN(duphigh) + +template <size_t... Indices, typename T, size_t N> +KFR_INTRINSIC vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>()); +} +KFR_FN(blend) + +template <size_t elements = 2, typename T, size_t N> +KFR_INTRINSIC vec<T, N> swap(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>()); +} +CMT_FN_TPL((size_t elements), (elements), swap) + +template <size_t shift, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) +{ + return shift == 0 ? lo : (shift == N ? 
hi : hi.shuffle(lo, csizeseq_t<N, N - shift>())); +} + +template <size_t amount, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +{ + static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); + return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>()); +} +KFR_FN(rotateright) + +template <size_t amount, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +{ + static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); + return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>()); +} +KFR_FN(rotateleft) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> insertright(T x, const vec<T, N>& y) +{ + return concat_and_slice<1, N>(y, vec<T, 1>(x)); +} +KFR_FN(insertright) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> insertleft(T x, const vec<T, N>& y) +{ + return concat_and_slice<0, N>(vec<T, 1>(x), y); +} +KFR_FN(insertleft) + +template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group, + size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)> +KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)> +KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x) +{ + return x; +} +template <typename T, size_t N> +KFR_INTRINSIC vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x) +{ + return vec<vec<T, N>, N>::from_flatten(transpose<2>(x.flatten())); +} +KFR_FN(transpose) + +template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group, + size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)> +KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % 
csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> +KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x) +{ + return x; +} +KFR_FN(transposeinverse) + +template <size_t side, typename T, size_t N> +KFR_INTRINSIC vec<T, N> ctranspose(const vec<T, N>& x) +{ + return transpose<side, 2>(x); +} +KFR_FN(ctranspose) + +template <size_t side, typename T, size_t N> +KFR_INTRINSIC vec<T, N> ctransposeinverse(const vec<T, N>& x) +{ + return transposeinverse<side, 2>(x); +} +KFR_FN(ctransposeinverse) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group, + size_t side2 = 2, size_t side1 = size / side2> +KFR_INTRINSIC vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(interleave) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2, + size_t side1 = size / side2> +KFR_INTRINSIC vec<T, N> interleavehalfs(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(interleavehalfs) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2, + size_t side2 = size / side1> +KFR_INTRINSIC vec<T, N> splitpairs(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(splitpairs) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group> +KFR_INTRINSIC vec<T, N> reverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>())); +} +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC 
vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x) +{ + return swap<N1>(x.flatten()).v; +} +KFR_FN(reverse) + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) +{ + static_assert(N2 <= N1, "N2 <= N1"); + return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>()); +} +KFR_FN(combine) + +namespace internal +{ +template <size_t start, size_t stride> +struct generate_index +{ + KFR_INTRINSIC constexpr size_t operator()(size_t index) const { return start + index * stride; } +}; +template <size_t start, size_t size, int on, int off> +struct generate_onoff +{ + KFR_INTRINSIC constexpr size_t operator()(size_t index) const + { + return index >= start && index < start + size ? on : off; + } +}; +} // namespace internal + +template <typename T, size_t N, size_t start = 0, size_t stride = 1> +constexpr KFR_INTRINSIC vec<T, N> enumerate() +{ + return generate_vector<T, N, internal::generate_index<start, stride>>(); +} +template <size_t start = 0, size_t stride = 1, typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> enumerate(vec_shape<T, N>) +{ + return generate_vector<T, N, internal::generate_index<start, stride>>(); +} +KFR_FN(enumerate) + +template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0> +constexpr KFR_INTRINSIC vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) +{ + return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); +} +template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> onoff(vec_shape<T, N>, cint_t<on> = cint_t<on>(), + cint_t<off> = cint_t<off>()) +{ + return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); +} +KFR_FN(onoff) + +} // namespace CMT_ARCH_NAME +} // namespace kfr +#define KFR_SHUFFLE_SPECIALIZATIONS 1 +#include "impl/specializations.i" diff 
--git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp @@ -0,0 +1,372 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../kfr.h" + +#include "impl/intrinsics.h" + +#include <climits> + +#include <cmath> +#include <limits> +#include <random> + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers") + +#ifdef KFR_TESTING +#include "../testo/testo.hpp" +#endif + +#include "../cometa.hpp" +#include "../cometa/numeric.hpp" + +namespace kfr +{ + +// Include all from CoMeta library +using namespace cometa; + +using cometa::fbase; +using cometa::fmax; + +// primary template (used for zero types) +template <typename... T> +struct common_type_impl +{ +}; + +template <typename... 
T> +using decay_common = decay<common_type_impl<T...>>; + +template <typename T1, typename T2, template <typename TT> class result_type, typename = void> +struct common_type_from_subtypes +{ +}; + +template <typename T1, typename T2, template <typename TT> class result_type> +struct common_type_from_subtypes<T1, T2, result_type, void_t<typename common_type_impl<T1, T2>::type>> +{ + using type = result_type<typename common_type_impl<T1, T2>::type>; +}; + +template <typename T> +struct common_type_impl<T> +{ + using type = decay<T>; +}; + +template <typename T1, typename T2> +using common_for_two = decltype(false ? std::declval<T1>() : std::declval<T2>()); + +template <typename T1, typename T2, typename = void> +struct common_type_2_default +{ +}; + +template <typename T1, typename T2> +struct common_type_2_default<T1, T2, void_t<common_for_two<T1, T2>>> +{ + using type = std::decay_t<common_for_two<T1, T2>>; +}; + +template <typename T1, typename T2, typename D1 = decay<T1>, typename D2 = decay<T2>> +struct common_type_2_impl : common_type_impl<D1, D2> +{ +}; + +template <typename D1, typename D2> +struct common_type_2_impl<D1, D2, D1, D2> : common_type_2_default<D1, D2> +{ +}; + +template <typename T1, typename T2> +struct common_type_impl<T1, T2> : common_type_2_impl<T1, T2> +{ +}; + +template <typename AlwaysVoid, typename T1, typename T2, typename... R> +struct common_type_multi_impl +{ +}; + +template <typename T1, typename T2, typename... R> +struct common_type_multi_impl<void_t<typename common_type_impl<T1, T2>::type>, T1, T2, R...> + : common_type_impl<typename common_type_impl<T1, T2>::type, R...> +{ +}; + +template <typename T1, typename T2, typename... R> +struct common_type_impl<T1, T2, R...> : common_type_multi_impl<void, T1, T2, R...> +{ +}; + +template <typename... 
T> +using common_type = typename common_type_impl<T...>::type; + +constexpr ctypes_t<i8, i16, i32, i64> signed_types{}; +constexpr ctypes_t<u8, u16, u32, u64> unsigned_types{}; +constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64> integer_types{}; +constexpr ctypes_t<f32 +#ifdef KFR_NATIVE_F64 + , + f64 +#endif + > + float_types{}; +constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64, f32 +#ifdef KFR_NATIVE_F64 + , + f64 +#endif + > + numeric_types{}; + +constexpr csizes_t<1, 2, 3, 4, 8, 16, 32, 64> test_vector_sizes{}; + +template <template <typename, size_t> class vec_tpl, typename T, + typename sizes = +#ifdef KFR_EXTENDED_TESTS + cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<64 / sizeof(T)>)> +#else + csizes_t<1> +#endif + > +struct vector_types_for_size_t_impl; + +template <template <typename, size_t> class vec_tpl, typename T, size_t... sizes> +struct vector_types_for_size_t_impl<vec_tpl, T, csizes_t<sizes...>> +{ + using type = ctypes_t<vec_tpl<T, sizes>...>; +}; + +template <template <typename, size_t> class vec_tpl, typename T> +using vector_types_for_size_t = typename vector_types_for_size_t_impl<vec_tpl, T>::type; + +template <template <typename, size_t> class vec_tpl> +using signed_vector_types_t = + concat_lists<vector_types_for_size_t<vec_tpl, i8>, vector_types_for_size_t<vec_tpl, i16>, + vector_types_for_size_t<vec_tpl, i32>, vector_types_for_size_t<vec_tpl, i64>>; + +template <template <typename, size_t> class vec_tpl> +constexpr signed_vector_types_t<vec_tpl> signed_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +using unsigned_vector_types_t = + concat_lists<vector_types_for_size_t<vec_tpl, u8>, vector_types_for_size_t<vec_tpl, u16>, + vector_types_for_size_t<vec_tpl, u32>, vector_types_for_size_t<vec_tpl, u64>>; + +template <template <typename, size_t> class vec_tpl> +constexpr unsigned_vector_types_t<vec_tpl> unsigned_vector_types{}; + +template <template <typename, size_t> class 
vec_tpl> +using integer_vector_types_t = concat_lists<signed_vector_types_t<vec_tpl>, unsigned_vector_types_t<vec_tpl>>; + +template <template <typename, size_t> class vec_tpl> +constexpr integer_vector_types_t<vec_tpl> integer_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +using float_vector_types_t = concat_lists<vector_types_for_size_t<vec_tpl, f32> +#ifdef KFR_NATIVE_F64 + , + vector_types_for_size_t<vec_tpl, f64> +#endif + >; + +template <template <typename, size_t> class vec_tpl> +constexpr float_vector_types_t<vec_tpl> float_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +constexpr concat_lists<integer_vector_types_t<vec_tpl>, float_vector_types_t<vec_tpl>> numeric_vector_types{}; + +struct u24 +{ + u8 raw[3]; +}; + +struct i24 +{ + u8 raw[3]; + + constexpr i24(i32 x) CMT_NOEXCEPT : raw{} + { + raw[0] = x & 0xFF; + raw[1] = (x >> 8) & 0xFF; + raw[2] = (x >> 16) & 0xFF; + } + + constexpr i32 as_int() const CMT_NOEXCEPT + { + return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) | + (static_cast<i32>(raw[2] << 24) >> 8); + } + + operator int() const CMT_NOEXCEPT { return as_int(); } +}; + +struct f16 +{ + u16 raw; +}; + +template <size_t bits> +struct bitmask +{ + using type = conditional<(bits > 32), uint64_t, + conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>; + + bitmask(type val) : value(val) {} + + type value; +}; + +template <typename T> +struct maskbit +{ + bool value; +}; + +namespace fn_generic +{ +///@copybrief cometa::pass_through +using pass_through = cometa::fn_pass_through; + +///@copybrief cometa::noop +using noop = cometa::fn_noop; + +///@copybrief cometa::get_first +using get_first = cometa::fn_get_first; + +///@copybrief cometa::get_second +using get_second = cometa::fn_get_second; + +///@copybrief cometa::get_third +using get_third = cometa::fn_get_third; + +///@copybrief cometa::returns +template <typename T> +using returns = cometa::fn_returns<T>; 
+} // namespace fn_generic + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes") + +template <typename T, bool A> +struct struct_with_alignment +{ + using pointer = struct_with_alignment*; + using const_pointer = const struct_with_alignment*; + T value; + KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; } +}; + +#ifdef CMT_COMPILER_MSVC +#define KFR_UNALIGNED_POINTER __unaligned +#else +#define KFR_UNALIGNED_POINTER +#endif + +template <typename T> +struct struct_with_alignment<T, false> +{ + using pointer = KFR_UNALIGNED_POINTER struct_with_alignment*; + using const_pointer = KFR_UNALIGNED_POINTER const struct_with_alignment*; + T value; + KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; } +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((__packed__, __may_alias__)) // +#endif +; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +/// @brief Fills a value with zeros +template <typename T1> +KFR_INTRINSIC void zeroize(T1& value) +{ + builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); +} + +/// @brief Used to determine the initial value for reduce functions +template <typename T> +struct initialvalue +{ +}; + +template <typename T> +struct is_simd_type + : std::integral_constant< + bool, std::is_same<T, float>::value || std::is_same<T, double>::value || + std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value || + std::is_same<T, short>::value || std::is_same<T, unsigned short>::value || + std::is_same<T, int>::value || std::is_same<T, unsigned int>::value || + std::is_same<T, long>::value || std::is_same<T, unsigned long>::value || + std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value> +{ +}; + +template <typename T, size_t N> +struct vec_shape +{ + using value_type = T; + constexpr static size_t size() CMT_NOEXCEPT { return N; } + constexpr vec_shape() CMT_NOEXCEPT = default; + + using scalar_type = subtype<T>; + constexpr static 
size_t scalar_size() CMT_NOEXCEPT { return N * compound_type_traits<T>::width; } +}; + +constexpr size_t index_undefined = static_cast<size_t>(-1); + +struct czeros_t +{ +}; +struct cones_t +{ +}; +constexpr czeros_t czeros{}; +constexpr cones_t cones{}; + +using caligned_t = cbool_t<true>; +using cunaligned_t = cbool_t<false>; + +constexpr caligned_t caligned{}; +constexpr cunaligned_t cunaligned{}; + +#ifdef CMT_INTRINSICS_IS_CONSTEXPR +#define KFR_I_CE constexpr +#else +#define KFR_I_CE +#endif + +#define avoid_odr_use(x) static_cast<decltype(x)>(x) + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -0,0 +1,1283 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../version.hpp" +#include "constants.hpp" +#include "impl/backend.hpp" + +/** + * @brief Internal macro for functions + */ +#define KFR_FN(FN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... 
Args> \ + CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const \ + { \ + return ::kfr::FN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +/** + * @brief Internal macro for functions + */ +#define KFR_I_FN(FN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... Args> \ + CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ + Args&&... args) const \ + { \ + return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +#define KFR_I_FN_FULL(FN, FULLFN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... Args> \ + CMT_INLINE_MEMBER decltype(FULLFN(std::declval<Args>()...)) operator()(Args&&... args) const \ + { \ + return FULLFN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked") + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4814)) + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N> +struct alignas(next_poweroftwo(sizeof(T)) * next_poweroftwo(N)) portable_vec +{ + static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; } + + static_assert(N > 0 && N <= 1024, "Invalid vector size"); + + static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); + + // type and size + using value_type = T; + + constexpr static size_t size() CMT_NOEXCEPT { return N; } + + T elem[N]; +}; + +template <typename T, size_t N> +struct vec; + +template <typename T, size_t N> +struct mask; + +template <typename T, size_t N> +struct vec_halves +{ + vec<T, prev_poweroftwo(N - 1)> low; + vec<T, N - prev_poweroftwo(N - 1)> high; 
+}; + +template <typename T> +struct vec_halves<T, 1> +{ + T val; +}; + +namespace internal +{ + +// scalar to scalar +template <typename To, typename From> +struct conversion +{ + static_assert(std::is_convertible<From, To>::value, ""); + + static To cast(const From& value) { return value; } +}; + +template <typename T> +struct compoundcast +{ + static vec<T, 1> to_flat(const T& x) { return vec<T, 1>(x); } + static T from_flat(const vec<T, 1>& x) { return x.front(); } +}; +template <typename T, size_t N> +struct compoundcast<vec<T, N>> +{ + static const vec<T, N>& to_flat(const vec<T, N>& x) { return x; } + static const vec<T, N>& from_flat(const vec<T, N>& x) { return x; } +}; +template <typename T, size_t N1, size_t N2> +struct compoundcast<vec<vec<T, N1>, N2>> +{ + static vec<T, N1 * N2> to_flat(const vec<vec<T, N1>, N2>& x) { return x; } + static vec<vec<T, N1>, N2> from_flat(const vec<T, N1 * N2>& x) { return x; } +}; +} // namespace internal + +template <typename T, size_t N> +struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<T>::deep_subtype, + N * compound_type_traits<T>::deep_width>), + const_min(size_t(platform<>::native_vector_alignment), + next_poweroftwo(sizeof(typename compound_type_traits<T>::deep_subtype) * + N * compound_type_traits<T>::deep_width)))) vec +{ + static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; } + + // type and size + using value_type = T; + + constexpr static size_t size() CMT_NOEXCEPT { return N; } + + using ST = typename compound_type_traits<T>::deep_subtype; + using scalar_type = ST; + + enum : size_t + { + SW = compound_type_traits<T>::deep_width, + SN = N * SW + }; + + constexpr static size_t scalar_size() CMT_NOEXCEPT { return SN; } + + static_assert(is_simd_type<scalar_type>::value, "Invalid vector type"); + + static_assert(scalar_size() > 0 && scalar_size() <= 1024, "Invalid vector size"); + + using mask_t = mask<T, N>; + + using simd_type = intrinsics::simd<ST, SN>; + using 
uvalue_type = utype<T>; + using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; + + using uscalar_type = utype<ST>; + using iuscalar_type = conditional<is_i_class<ST>::value, ST, uscalar_type>; + + using usimd_type = intrinsics::simd<uscalar_type, SN>; + using iusimd_type = intrinsics::simd<iuscalar_type, SN>; + + // constructors and assignment + // from SIMD + KFR_MEM_INTRINSIC vec(const simd_type& simd) CMT_NOEXCEPT : v(simd) {} + // default + KFR_MEM_INTRINSIC constexpr vec() CMT_NOEXCEPT = default; + // copy + KFR_MEM_INTRINSIC constexpr vec(const vec& value) CMT_NOEXCEPT = default; + // move + KFR_MEM_INTRINSIC constexpr vec(vec&&) CMT_NOEXCEPT = default; + // assignment + KFR_MEM_INTRINSIC constexpr vec& operator=(const vec&) CMT_NOEXCEPT = default; + + // from scalar + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT + : v(intrinsics::simd_broadcast(intrinsics::simd_t<ST, SN>{}, static_cast<ST>(s))) + { + } + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT + : v(intrinsics::simd_shuffle(intrinsics::simd_t<ST, SW>{}, + internal::compoundcast<T>::to_flat(static_cast<T>(s)).v, + csizeseq<SN> % csize<SW>, overload_auto)) + { + } + + // from list + template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT + : v(intrinsics::simd_make(ctype<T>, s0, s1, static_cast<value_type>(rest)...)) + { + } + template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... 
rest) CMT_NOEXCEPT + : v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>( + internal::compoundcast<T>::to_flat(s0).v, internal::compoundcast<T>::to_flat(s1).v, + internal::compoundcast<T>::to_flat(static_cast<T>(rest)).v...)) + { + } + + // from vector of another type + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT + : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v)) + { + } + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT + : v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v) + { + } + + // from list of vectors + template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>> + KFR_MEM_INTRINSIC vec(const vec<T, Ns>&... vs) CMT_NOEXCEPT + : v(intrinsics::simd_concat<ST, (SW * Ns)...>(vs.v...)) + { + } + + KFR_MEM_INTRINSIC vec(const portable_vec<T, N>& p) CMT_NOEXCEPT : vec(bitcast_anything<vec>(p)) {} + + KFR_MEM_INTRINSIC operator portable_vec<T, N>() const CMT_NOEXCEPT + { + return bitcast_anything<portable_vec<T, N>>(*this); + } + + KFR_MEM_INTRINSIC vec(czeros_t) CMT_NOEXCEPT : v(intrinsics::simd_zeros<ST, SN>()) {} + + KFR_MEM_INTRINSIC vec(cones_t) CMT_NOEXCEPT : v(intrinsics::simd_allones<ST, SN>()) {} + + template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> + KFR_MEM_INTRINSIC static vec frombits(const vec<U, M>& v) CMT_NOEXCEPT + { + return intrinsics::simd_bitcast( + intrinsics::simd_cvt_t<ST, typename vec<U, M>::scalar_type, vec<U, M>::scalar_size()>{}, v.v); + } + + // shuffle + template <size_t... 
indices> + KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...> i) const CMT_NOEXCEPT + { + return vec<value_type, sizeof...(indices)>( + intrinsics::simd_shuffle(intrinsics::simd_t<ST, SN>{}, v, scale<SW>(i), overload_auto)); + } + + template <size_t... indices> + KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(const vec& y, + csizes_t<indices...> i) const CMT_NOEXCEPT + { + return vec<value_type, sizeof...(indices)>( + intrinsics::simd_shuffle(intrinsics::simd2_t<ST, SN, SN>{}, v, y.v, scale<SW>(i), overload_auto)); + } + + // element access + struct element; + + KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) const& CMT_NOEXCEPT { return get(index); } + + KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) && CMT_NOEXCEPT { return get(index); } + + KFR_MEM_INTRINSIC constexpr element operator[](size_t index) & CMT_NOEXCEPT { return { *this, index }; } + + KFR_MEM_INTRINSIC value_type front() const CMT_NOEXCEPT { return get(csize<0>); } + + KFR_MEM_INTRINSIC value_type back() const CMT_NOEXCEPT { return get(csize<N - 1>); } + + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT + { + return intrinsics::simd_get_element<T, N>(v, index); + } + template <int dummy = 0, typename = void, + KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT + { + return this->s[index]; + } + + template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT + { + return intrinsics::simd_get_element<T, N>(v, csize<index>); + } + template <size_t index, typename = void, + KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const 
CMT_NOEXCEPT + { + return internal::compoundcast<T>::from_flat(intrinsics::simd_shuffle( + intrinsics::simd_t<ST, SN>{}, v, csizeseq<SW, SW * index>, overload_auto)); + } + + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT + { + v = intrinsics::simd_set_element<T, N>(v, index, s); + } + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT + { + this->s[index] = s; + } + + template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT + { + v = intrinsics::simd_set_element<T, N>(v, csize<index>, s); + } + template <size_t index, typename = void, + KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT + { + this->s[index] = s; + } + + struct element + { + constexpr operator value_type() const CMT_NOEXCEPT { return v.get(index); } + + KFR_MEM_INTRINSIC element& operator=(const value_type& s) CMT_NOEXCEPT + { + v.set(index, s); + return *this; + } + + KFR_MEM_INTRINSIC element& operator=(const element& s) CMT_NOEXCEPT + { + v.set(index, static_cast<value_type>(s)); + return *this; + } + + template <typename U, size_t M> + KFR_MEM_INTRINSIC element& operator=(const typename vec<U, M>::element& s) CMT_NOEXCEPT + { + v.set(index, static_cast<value_type>(static_cast<U>(s))); + return *this; + } + + vec& v; + size_t index; + }; + + // read/write + template <bool aligned = false> + KFR_MEM_INTRINSIC explicit constexpr vec(const value_type* src, + cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT + : v(intrinsics::simd_read<SN, aligned>(ptr_cast<ST>(src))) + { + } + + template <bool aligned = false> + 
KFR_MEM_INTRINSIC const vec& write(value_type* dest, + cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT + { + intrinsics::simd_write<aligned, SN>(ptr_cast<ST>(dest), v); + return *this; + } + + KFR_MEM_INTRINSIC vec<ST, SN> flatten() const CMT_NOEXCEPT { return v; } + KFR_MEM_INTRINSIC static vec from_flatten(const vec<ST, SN>& x) { return vec(x.v); } + + KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); } + + constexpr static size_t simd_element_size = const_min(vector_width<T>, N); + constexpr static size_t simd_element_count = N / simd_element_size; + using simd_element_type = simd<ST, simd_element_size>; + +public: + union { + simd_type v; + vec_halves<T, N> h; + simd_element_type w[simd_element_count]; + T s[N]; + }; +}; + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevector(const vec<T, N>& x, + csizes_t<indices...> i) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, i, overload_auto); +} + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevectors(const vec<T, N>& x, const vec<T, N>& y, + csizes_t<indices...> i) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, i, overload_auto); +} + +namespace internal +{ + +#if 0 +constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index) CMT_NOEXCEPT +{ + return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index); +} + +#ifdef CMT_COMPILER_MSVC +template <size_t counter, size_t groupsize, size_t... indices> +constexpr inline size_t scale_get_index(csizes_t<indices...>) CMT_NOEXCEPT +{ + return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); +} + +template <size_t... indices, size_t... 
counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> +constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) CMT_NOEXCEPT + -> csizes_t<scale_get_index<counter, groupsize>(ind)...> +{ + return {}; +} +#else + +template <size_t counter, size_t groupsize, size_t... indices> +constexpr inline size_t scale_get_index() CMT_NOEXCEPT +{ + return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); +} + +template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> +constexpr inline auto scale_impl(csizes_t<indices...>, csizes_t<counter...>) CMT_NOEXCEPT + -> csizes_t<scale_get_index<counter, groupsize, indices...>()...> +{ + return {}; +} + +#endif +#endif + +} // namespace internal + +template <size_t groupsize, size_t... indices> +constexpr inline auto scale() CMT_NOEXCEPT +{ + return cconcat(csizeseq<groupsize, groupsize * indices>...); + // return internal::scale_impl(csizes_t<indices...>(), csizeseq<sizeof...(indices) * groupsize>); +} + +namespace internal +{ +template <typename T> +struct is_vec_impl : std::false_type +{ +}; + +template <typename T, size_t N> +struct is_vec_impl<vec<T, N>> : std::true_type +{ +}; +} // namespace internal + +template <typename T> +using is_vec = internal::is_vec_impl<T>; + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast") + +template <size_t N, typename T> +constexpr KFR_INTRINSIC vec<T, N> broadcast(T x) +{ + return x; +} + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +namespace internal +{ + +template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>, + size_t Nout = (N * compound_type_traits<To>::deep_width)> +constexpr KFR_INTRINSIC vec<To, N> builtin_convertvector(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, N>(value); +} + +// vector to vector +template <typename To, typename From, size_t N, size_t N2> +struct 
conversion<vec<To, N>, vec<From, N2>> +{ + static_assert(N == N2, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + + static vec<To, N> cast(const vec<From, N>& value) { return vec<To, N>(value); } +}; + +// scalar to vector +template <typename To, typename From, size_t N> +struct conversion<vec<To, N>, From> +{ + static_assert(std::is_convertible<From, To>::value, ""); + + static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } +}; +} // namespace internal + +template <typename T> +constexpr size_t size_of() CMT_NOEXCEPT +{ + return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width; +} + +template <typename From, size_t N, typename Tsub = deep_subtype<From>, + size_t Nout = N* size_of<From>() / size_of<Tsub>()> +constexpr KFR_INTRINSIC vec<Tsub, Nout> flatten(const vec<From, N>& x) CMT_NOEXCEPT +{ + return x.flatten(); +} + +template <typename To, typename From, + typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> +constexpr KFR_INTRINSIC Tout cast(const From& value) CMT_NOEXCEPT +{ + return static_cast<Tout>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> cast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<vec<Tout, N1>, N2>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& cast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& 
cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return value; +} + +// + +template <typename To, typename From, + typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> +constexpr KFR_INTRINSIC Tout innercast(const From& value) CMT_NOEXCEPT +{ + return static_cast<Tout>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> innercast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> innercast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<vec<Tout, N1>, N2>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& innercast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& innercast(const vec<vec<Tin, N1>, N2>& value) + CMT_NOEXCEPT +{ + return value; +} + +// + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N2> elemcast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<Tout, N2>(value); +} + +template <typename To, typename From> +CMT_GNU_CONSTEXPR KFR_INTRINSIC To 
bitcast(const From& value) CMT_NOEXCEPT +{ + static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); + union { + From from; + To to; + } u{ value }; + return u.to; +} + +template <typename To, typename From, size_t N, size_t Nout = (N * size_of<From>() / size_of<To>())> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<To, Nout> bitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To ubitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To ibitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To fbitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = uitype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To uibitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, size_t N, typename To = utype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> ubitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, size_t N, typename To = itype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> ibitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, size_t N, typename To = ftype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> fbitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + 
+template <typename From, size_t N, typename To = uitype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> uibitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +constexpr KFR_INTRINSIC size_t vector_alignment(size_t size) { return next_poweroftwo(size); } + +template <typename T, size_t N> +struct pkd_vec +{ + constexpr pkd_vec() CMT_NOEXCEPT {} + + pkd_vec(const vec<T, N>& value) CMT_NOEXCEPT { value.write(v); } + + template <typename... Ts> + constexpr pkd_vec(Ts... init) CMT_NOEXCEPT : v{ static_cast<T>(init)... } + { + static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec"); + } + +private: + T v[N]; + friend struct vec<T, N>; +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((packed)) +#endif +; + +namespace internal +{ + +template <size_t, typename T> +constexpr KFR_INTRINSIC T make_vector_get_n() +{ + return T(); +} + +template <size_t index, typename T, typename... Args> +constexpr KFR_INTRINSIC T make_vector_get_n(const T& arg, const Args&... args) +{ + return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...); +} + +template <typename T, typename... Args, size_t... indices, size_t N = sizeof...(Args)> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args) +{ + static_assert(sizeof...(indices) == sizeof...(Args), ""); + const T list[] = { static_cast<T>(args)... }; + return vec<T, N>(list[indices]...); +} +} // namespace internal + +/// Create vector from scalar values +/// @code +/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} ); +/// @endcode +template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), + typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>> +constexpr KFR_INTRINSIC vec<SubType, N> make_vector(const Arg& x, const Args&... rest) +{ + // static_assert(! 
is_same<SubType, unsigned long long>::value, "!!!--1"); + // static_assert(! is_same<fix_type<SubType>, unsigned long long>::value, "!!!--2"); + return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x), + static_cast<SubType>(rest)...); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> make_vector(const vec<T, N>& x) +{ + return x; +} + +template <typename T, T... Values, size_t N = sizeof...(Values)> +constexpr KFR_INTRINSIC vec<T, N> make_vector(cvals_t<T, Values...>) +{ + return make_vector<T>(Values...); +} + +template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), + typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>, + KFR_ENABLE_IF(is_number<subtype<SubType>>::value)> +constexpr KFR_INTRINSIC vec<SubType, N> pack(const Arg& x, const Args&... rest) +{ + return internal::make_vector_impl<SubType>(csizeseq<N>, static_cast<SubType>(x), + static_cast<SubType>(rest)...); +} + +using f32x1 = vec<f32, 1>; +using f32x2 = vec<f32, 2>; +using f32x3 = vec<f32, 3>; +using f32x4 = vec<f32, 4>; +using f32x8 = vec<f32, 8>; +using f32x16 = vec<f32, 16>; +using f32x32 = vec<f32, 32>; +using f32x64 = vec<f32, 64>; +using f64x1 = vec<f64, 1>; +using f64x2 = vec<f64, 2>; +using f64x3 = vec<f64, 3>; +using f64x4 = vec<f64, 4>; +using f64x8 = vec<f64, 8>; +using f64x16 = vec<f64, 16>; +using f64x32 = vec<f64, 32>; +using f64x64 = vec<f64, 64>; +using i8x1 = vec<i8, 1>; +using i8x2 = vec<i8, 2>; +using i8x3 = vec<i8, 3>; +using i8x4 = vec<i8, 4>; +using i8x8 = vec<i8, 8>; +using i8x16 = vec<i8, 16>; +using i8x32 = vec<i8, 32>; +using i8x64 = vec<i8, 64>; +using i16x1 = vec<i16, 1>; +using i16x2 = vec<i16, 2>; +using i16x3 = vec<i16, 3>; +using i16x4 = vec<i16, 4>; +using i16x8 = vec<i16, 8>; +using i16x16 = vec<i16, 16>; +using i16x32 = vec<i16, 32>; +using i16x64 = vec<i16, 64>; +using i32x1 = vec<i32, 1>; +using i32x2 = vec<i32, 2>; +using 
i32x3 = vec<i32, 3>; +using i32x4 = vec<i32, 4>; +using i32x8 = vec<i32, 8>; +using i32x16 = vec<i32, 16>; +using i32x32 = vec<i32, 32>; +using i32x64 = vec<i32, 64>; +using i64x1 = vec<i64, 1>; +using i64x2 = vec<i64, 2>; +using i64x3 = vec<i64, 3>; +using i64x4 = vec<i64, 4>; +using i64x8 = vec<i64, 8>; +using i64x16 = vec<i64, 16>; +using i64x32 = vec<i64, 32>; +using i64x64 = vec<i64, 64>; +using u8x1 = vec<u8, 1>; +using u8x2 = vec<u8, 2>; +using u8x3 = vec<u8, 3>; +using u8x4 = vec<u8, 4>; +using u8x8 = vec<u8, 8>; +using u8x16 = vec<u8, 16>; +using u8x32 = vec<u8, 32>; +using u8x64 = vec<u8, 64>; +using u16x1 = vec<u16, 1>; +using u16x2 = vec<u16, 2>; +using u16x3 = vec<u16, 3>; +using u16x4 = vec<u16, 4>; +using u16x8 = vec<u16, 8>; +using u16x16 = vec<u16, 16>; +using u16x32 = vec<u16, 32>; +using u16x64 = vec<u16, 64>; +using u32x1 = vec<u32, 1>; +using u32x2 = vec<u32, 2>; +using u32x3 = vec<u32, 3>; +using u32x4 = vec<u32, 4>; +using u32x8 = vec<u32, 8>; +using u32x16 = vec<u32, 16>; +using u32x32 = vec<u32, 32>; +using u32x64 = vec<u32, 64>; +using u64x1 = vec<u64, 1>; +using u64x2 = vec<u64, 2>; +using u64x3 = vec<u64, 3>; +using u64x4 = vec<u64, 4>; +using u64x8 = vec<u64, 8>; +using u64x16 = vec<u64, 16>; +using u64x32 = vec<u64, 32>; +using u64x64 = vec<u64, 64>; + +namespace glsl_names +{ +using vec2 = f32x2; +using vec3 = f32x3; +using vec4 = f32x4; +using dvec2 = f64x2; +using dvec3 = f64x3; +using dvec4 = f64x4; +using ivec2 = i32x2; +using ivec3 = i32x3; +using ivec4 = i32x4; +using uvec2 = u32x2; +using uvec3 = u32x3; +using uvec4 = u32x4; +} // namespace glsl_names +namespace opencl_names +{ +using char2 = i8x2; +using char3 = i8x3; +using char4 = i8x4; +using char8 = i8x8; +using char16 = i8x16; +using uchar2 = u8x2; +using uchar3 = u8x3; +using uchar4 = u8x4; +using uchar8 = u8x8; +using uchar16 = u8x16; + +using short2 = i16x2; +using short3 = i16x3; +using short4 = i16x4; +using short8 = i16x8; +using short16 = i16x16; +using ushort2 = 
u16x2; +using ushort3 = u16x3; +using ushort4 = u16x4; +using ushort8 = u16x8; +using ushort16 = u16x16; + +using int2 = i32x2; +using int3 = i32x3; +using int4 = i32x4; +using int8 = i32x8; +using int16 = i32x16; +using uint2 = u32x2; +using uint3 = u32x3; +using uint4 = u32x4; +using uint8 = u32x8; +using uint16 = u32x16; + +using long2 = i64x2; +using long3 = i64x3; +using long4 = i64x4; +using long8 = i64x8; +using long16 = i64x16; +using ulong2 = u64x2; +using ulong3 = u64x3; +using ulong4 = u64x4; +using ulong8 = u64x8; +using ulong16 = u64x16; + +using float2 = f32x2; +using float3 = f32x3; +using float4 = f32x4; +using float8 = f32x8; +using float16 = f32x16; + +using double2 = f64x2; +using double3 = f64x3; +using double4 = f64x4; +using double8 = f64x8; +using double16 = f64x16; +} // namespace opencl_names + +namespace internal +{ + +template <size_t Index, typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(subtype<decay<Args>>...)>> +constexpr KFR_INTRINSIC Tout applyfn_helper(Fn&& fn, Args&&... args) +{ + return fn(args[Index]...); +} + +template <typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices> +constexpr KFR_INTRINSIC vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) +{ + return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...); +} + +template <typename T, size_t N, typename Fn, size_t... Indices> +constexpr KFR_INTRINSIC vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) +{ + return make_vector(((void)Indices, void(), fn())...); +} +} // namespace internal + +template <typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> +constexpr KFR_INTRINSIC vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... 
args) +{ + return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...); +} + +template <typename T, typename Fn, typename... Args, typename Tout = result_of<Fn(T, decay<Args>...)>, + KFR_ENABLE_IF(is_same<T, subtype<T>>::value)> +constexpr KFR_INTRINSIC Tout apply(Fn&& fn, const T& arg, Args&&... args) +{ + return fn(arg, args...); +} + +template <size_t N, typename Fn, typename T = result_of<Fn()>> +constexpr KFR_INTRINSIC vec<T, N> apply(Fn&& fn) +{ + return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector() +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec_shape<T, N>) +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec<T, N>) +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector() +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec_shape<T, N>) +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec<T, N>) +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> undefinedvector() +{ + return vec<T, N>{}; +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> undefinedvector(vec_shape<T, N>) +{ + return undefinedvector<T, N>(); +} + +template <size_t N> +struct vec_template +{ + template <typename T> + using type = vec<T, N>; +}; + +#ifdef KFR_TESTING + +inline const std::vector<special_value>& special_values() +{ + static const std::vector<special_value> values{ special_constant::infinity, + special_constant::neg_infinity, + special_constant::min, + special_constant::lowest, + 
special_constant::max, + 3.1415926535897932384626433832795, + 4.499999, + 4.500001, + -4.499999, + -4.500001, + 0.1111111111111111111111111111111, + -0.4444444444444444444444444444444, + -1, + 0, + +1 }; + return values; +} + +namespace test_catogories +{ +constexpr cint_t<1> scalars{}; +constexpr cint_t<2> vectors{}; +constexpr cint_t<3> all{}; + +constexpr inline auto types(cint_t<0>) { return ctypes_t<>{}; } +constexpr inline auto types(cint_t<1>) { return cconcat(numeric_types); } +constexpr inline auto types(cint_t<2>) { return cconcat(numeric_vector_types<vec>); } +constexpr inline auto types(cint_t<3>) { return cconcat(numeric_types, numeric_vector_types<vec>); } + +} // namespace test_catogories + +template <typename T, size_t N, size_t... indices> +vec<T, N> test_enumerate(vec_shape<T, N>, csizes_t<indices...>, double start = 0, double step = 1) +{ + return make_vector<T>(static_cast<T>(start + step * indices)...); +} + +template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>> +void test_function1(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{}) +{ + testo::matrix( + named("type") = test_catogories::types(cat), named("value") = special_values(), + [&](auto type, special_value value) { + using T = type_of<decltype(type)>; + if (isapplicable(ctype<T>, value)) + { + const T x(value); + CHECK(std::is_same<decltype(fn(x)), typename compound_type_traits<T>::template rebind< + decltype(reffn(std::declval<subtype<T>>()))>>::value); + CHECK(fn(x) == apply(reffn, x)); + } + }); + + testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) { + using T = type_of<decltype(type)>; + const T x = test_enumerate(T::shape(), csizeseq<T::size()>, 0); + CHECK(fn(x) == apply(reffn, x)); + }); +} + +template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>> +void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, 
IsApplicable&& isapplicable = IsApplicable{}) +{ + testo::matrix( + named("type") = test_catogories::types(cat), + named("value1") = special_values(), // + named("value2") = special_values(), [&](auto type, special_value value1, special_value value2) { + using T = type_of<decltype(type)>; + const T x1(value1); + const T x2(value2); + if (isapplicable(ctype<T>, value1, value2)) + { + CHECK(std::is_same<decltype(fn(x1, x2)), + typename compound_type_traits<T>::template rebind<decltype(reffn( + std::declval<subtype<T>>(), std::declval<subtype<T>>()))>>::value); + CHECK(fn(x1, x2) == apply(reffn, x1, x2)); + } + }); + + testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) { + using T = type_of<decltype(type)>; + const T x1 = test_enumerate(T::shape(), csizeseq<T::size()>, 0, 1); + const T x2 = test_enumerate(T::shape(), csizeseq<T::size()>, 100, -1); + CHECK(fn(x1, x2) == apply(reffn, x1, x2)); + }); +} + +#endif + +namespace internal +{ +// vector<vector> to vector<vector> +template <typename To, typename From, size_t N1, size_t N2, size_t Ns1> +struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>> +{ + static_assert(N1 == Ns1, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<From, N1>& value) + { + return vec<vec<To, N1>, N2>::from_flatten( + kfr::innercast<To>(value.flatten()) + .shuffle(csizeseq<N2 * vec<From, N1>::scalar_size()> % csize<N2>)); + } +}; +// vector<vector> to vector<vector> +template <typename To, typename From, size_t N1, size_t N2, size_t NN1, size_t NN2> +struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>> +{ + static_assert(N1 == NN1, ""); + static_assert(N2 == NN2, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) + { + return vec<vec<To, N1>, 
N2>::from_flatten(kfr::innercast<To>(value.flatten())); + } +}; +} // namespace internal + +template <typename T, size_t N1, size_t N2 = N1> +using mat = vec<vec<T, N1>, N2>; + +using u8x2x2 = vec<vec<u8, 2>, 2>; +using i8x2x2 = vec<vec<i8, 2>, 2>; +using u16x2x2 = vec<vec<u16, 2>, 2>; +using i16x2x2 = vec<vec<i16, 2>, 2>; +using u32x2x2 = vec<vec<u32, 2>, 2>; +using i32x2x2 = vec<vec<i32, 2>, 2>; +using u64x2x2 = vec<vec<u64, 2>, 2>; +using i64x2x2 = vec<vec<i64, 2>, 2>; +using f32x2x2 = vec<vec<f32, 2>, 2>; +using f64x2x2 = vec<vec<f64, 2>, 2>; + +using u8x4x4 = vec<vec<u8, 4>, 4>; +using i8x4x4 = vec<vec<i8, 4>, 4>; +using u16x4x4 = vec<vec<u16, 4>, 4>; +using i16x4x4 = vec<vec<i16, 4>, 4>; +using u32x4x4 = vec<vec<u32, 4>, 4>; +using i32x4x4 = vec<vec<i32, 4>, 4>; +using u64x4x4 = vec<vec<u64, 4>, 4>; +using i64x4x4 = vec<vec<i64, 4>, 4>; +using f32x4x4 = vec<vec<f32, 4>, 4>; +using f64x4x4 = vec<vec<f64, 4>, 4>; + +template <size_t N1, size_t N2> +struct vec_vec_template +{ + template <typename T> + using type = vec<vec<T, N1>, N2>; +}; + +} // namespace CMT_ARCH_NAME +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, T2> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<T1, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; + +template <typename T1, typename T2, size_t N1, size_t N2> +struct common_type_impl<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>> + : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type> +{ + using type = kfr::vec<kfr::vec<typename common_type_impl<T1, T2>::type, N1>, N2>; +}; +template <typename T1, typename T2, size_t N1, size_t N2> 
+struct common_type_impl<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>> + : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type> +{ +}; +} // namespace kfr + +namespace cometa +{ + +template <typename T, size_t N> +struct compound_type_traits<kfr::vec_shape<T, N>> +{ + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + + template <typename U> + using rebind = kfr::vec_shape<U, N>; + template <typename U> + using deep_rebind = kfr::vec_shape<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; +}; + +template <typename T, size_t N> +struct compound_type_traits<kfr::vec<T, N>> +{ + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::vec<U, N>; + template <typename U> + using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; + + KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) + { + return value[index]; + } +}; + +namespace details +{ +template <typename T, size_t N> +struct flt_type_impl<kfr::vec<T, N>> +{ + using type = kfr::vec<typename flt_type_impl<T>::type, N>; +}; +} // namespace details +} // namespace cometa + +CMT_PRAGMA_GNU(GCC diagnostic pop) +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/testo/assert.hpp b/include/kfr/testo/assert.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "comparison.hpp" diff --git 
a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "../cometa/tuple.hpp" @@ -26,7 +29,7 @@ struct comparison R right; Fn cmp; - comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)) {} + comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)), cmp() {} bool operator()() const { return cmp(left, right); } }; @@ -53,28 +56,51 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") template <typename T> -inline T& epsilon() +inline T& current_epsilon() { static T value = std::numeric_limits<T>::epsilon(); return value; } +template <typename T> +struct eplison_scope +{ + eplison_scope(T scale) { current_epsilon<T>() = std::numeric_limits<T>::epsilon() * scale; } + ~eplison_scope() { current_epsilon<T>() = saved; } + T saved = current_epsilon<T>(); +}; + +template <> +struct eplison_scope<void> +{ + eplison_scope(float scale) : f(scale), d(scale), ld(scale) {} + eplison_scope<float> f; + eplison_scope<double> d; + eplison_scope<long double> ld; +}; + template <> struct equality_comparer<float, float> { - bool operator()(const float& l, const float& r) const { return !(std::abs(l - r) > epsilon<float>()); } + bool operator()(const float& l, const float& r) const + { + return !(std::abs(l - r) > current_epsilon<float>()); + } }; template <> struct equality_comparer<double, double> { - bool operator()(const double& l, const double& r) const { return !(std::abs(l - r) > epsilon<double>()); } + bool operator()(const double& l, const double& r) const + { + return !(std::abs(l - r) > current_epsilon<double>()); + } }; template <> struct equality_comparer<long double, long double> { bool operator()(const long double& l, const long double& r) const { - return !(std::abs(l - r) > epsilon<long double>()); + return !(std::abs(l - r) > current_epsilon<long 
double>()); } }; diff --git a/include/kfr/testo/console_colors.hpp b/include/kfr/testo/console_colors.hpp @@ -0,0 +1,166 @@ +#pragma once +#include <cstdint> +#include <cstdio> + +//#define CONSOLE_COLORS_FORCE_ASCII + +#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII +#define USE_WIN32_API +#endif + +#if defined(USE_WIN32_API) + +namespace win32_lite +{ +typedef void* HANDLE; +typedef uint32_t DWORD; + +#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10) +#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11) +#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12) + +#define WIN32_LITE_ENABLE_VIRTUAL_TERMINAL_PROCESSING (4) + +#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport) + +#define WIN32_LITE_WINAPI __stdcall + +typedef short SHORT; +typedef unsigned short WORD; +typedef int WINBOOL; + +extern "C" +{ + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI GetConsoleMode(HANDLE hConsole, DWORD* dwMode); + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleMode(HANDLE hConsole, DWORD dwMode); + WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle); + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput, + WORD wAttributes); +} +} // namespace win32_lite + +#endif + +namespace console_colors +{ + +enum text_color : uint32_t +{ + Black = 0x00, + DarkBlue = 0x01, + DarkGreen = 0x02, + DarkCyan = 0x03, + DarkRed = 0x04, + DarkMagenta = 0x05, + DarkYellow = 0x06, + LightGrey = 0x07, + Gray = 0x08, + Blue = 0x09, + Green = 0x0A, + Cyan = 0x0B, + Red = 0x0C, + Magenta = 0x0D, + Yellow = 0x0E, + White = 0x0F, + BgBlack = 0x00, + BgDarkBlue = 0x10, + BgDarkGreen = 0x20, + BgDarkCyan = 0x30, + BgDarkRed = 0x40, + BgDarkMagenta = 0x50, + BgDarkYellow = 0x60, + BgLightGrey = 0x70, + BgGray = 0x80, + BgBlue = 0x90, + BgGreen = 0xA0, + BgCyan = 0xB0, + BgRed = 0xC0, + BgMagenta = 0xD0, + BgYellow = 0xE0, + BgWhite = 0xF0, + + Normal = BgBlack | LightGrey 
+}; + +enum console_buffer +{ + ConsoleStdOutput, + ConsoleStdError +}; + +struct console_color +{ +public: + console_color(text_color c, console_buffer console = ConsoleStdOutput) + : m_old(get(console)), m_console(console) + { + set(c, m_console); + } + + ~console_color() { set(m_old, m_console); } + +private: + text_color get(console_buffer = ConsoleStdOutput) { return saved_color(); } + + void set(text_color new_color, console_buffer console = ConsoleStdOutput) + { +#ifdef USE_WIN32_API + win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput + ? WIN32_LITE_STD_OUTPUT_HANDLE + : WIN32_LITE_STD_ERROR_HANDLE), + static_cast<win32_lite::WORD>(new_color)); +#else + if (new_color != Normal) + { + uint8_t t = new_color & 0xF; + uint8_t b = (new_color & 0xF0) >> 4; + uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2); + uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2); + if (t & 8) + tnum += 60; + if (b & 8) + bnum += 60; + std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum); + } + else + { + std::fprintf(console == ConsoleStdOutput ? 
stdout : stderr, "\x1B[0m"); + } +#endif + saved_color() = new_color; + } + + text_color m_old; + console_buffer m_console; + static text_color& saved_color() + { + static text_color color = Normal; + return color; + } +}; + +template <text_color color, console_buffer console = ConsoleStdOutput> +struct console_color_tpl : public console_color +{ +public: + console_color_tpl() : console_color(color, console) {} + +private: +}; + +typedef console_color_tpl<DarkBlue> darkblue_text; +typedef console_color_tpl<DarkGreen> darkgreen_text; +typedef console_color_tpl<DarkCyan> darkcyan_text; +typedef console_color_tpl<DarkRed> darkred_text; +typedef console_color_tpl<DarkMagenta> darkmagenta_text; +typedef console_color_tpl<DarkYellow> darkyellow_text; +typedef console_color_tpl<LightGrey> lightgrey_text; +typedef console_color_tpl<Gray> gray_text; +typedef console_color_tpl<Blue> blue_text; +typedef console_color_tpl<Green> green_text; +typedef console_color_tpl<Cyan> cyan_text; +typedef console_color_tpl<Red> red_text; +typedef console_color_tpl<Magenta> magenta_text; +typedef console_color_tpl<Yellow> yellow_text; +typedef console_color_tpl<White> white_text; +} // namespace console_colors diff --git a/include/kfr/testo/double_double.hpp b/include/kfr/testo/double_double.hpp @@ -0,0 +1,170 @@ +#pragma once + +#include <algorithm> +#include <bitset> +#include <cmath> +#include <cstring> + +struct precise_fp +{ + int sign; // 1 means '+', -1 means '-', can't be 0 + int exponent; // unbiased, INT_MIN means 0/denormal, INT_MAX means inf/nan + uint64_t mantissa; // with explicit first bit set, 63 significant bits + + bool is_zero() const { return exponent == INT_MIN && mantissa == 0; } + bool is_denormal() const { return exponent == INT_MIN && mantissa != 0; } + bool is_inf() const { return exponent == INT_MAX && mantissa == 0; } + bool is_nan() const { return exponent == INT_MAX && mantissa != 0; } + + double to_double() const { return sign * 
std::ldexp(static_cast<double>(mantissa), exponent); } + float to_float() const { return sign * std::ldexp(static_cast<float>(mantissa), exponent); } + + precise_fp(int sign, int exponent, uint64_t mantissa) : sign(sign), exponent(exponent), mantissa(mantissa) + { + } + + template <typename T> + explicit precise_fp(T value) + { + sign = static_cast<int>(std::copysign(T(1), value)); + if (value == 0) + { + mantissa = 0; + exponent = INT_MIN; + } + else if (std::isinf(value)) + { + mantissa = 0; + exponent = INT_MAX; + } + else if (std::isnan(value)) + { + mantissa = 1; + exponent = INT_MAX; + } + else + { + mantissa = 0x80000000'00000000ull * std::frexp(value, &exponent); + } + } + + friend double precise_ulps(const precise_fp& x, const float& y) + { + return precise_ulps(x, precise_fp(y), -126, 24); + } + friend double precise_ulps(const precise_fp& x, const double& y) + { + return precise_ulps(x, precise_fp(y), -1022, 53); + } + + friend double precise_ulps(const precise_fp& x, const precise_fp& y, int minexponent, int mantissabits) + { + if (x.is_zero() && y.is_zero()) + return 0; + if (x.is_nan() && y.is_nan()) + return 0; + if (x.is_inf() && y.is_inf()) + return x.sign == y.sign ? 0 : HUGE_VAL; + if (x.is_zero() && y.is_zero()) + return 0; + + if (x.sign != y.sign) + return HUGE_VAL; + uint64_t xx = x.mantissa; + uint64_t yy = y.mantissa; + const int minexp = std::min(x.exponent, y.exponent); + if (x.exponent - minexp <= 1 && y.exponent - minexp <= 1) + { + xx >>= y.exponent - minexp; + yy >>= x.exponent - minexp; + return static_cast<double>(xx > yy ? 
xx - yy : yy - xx) / (1 << (63 - mantissabits)); + } + return HUGE_VAL; + } +}; + +struct double_double +{ + double hi, lo; + + static_assert(sizeof(double) == 8, ""); + + constexpr double_double(double x) noexcept : hi(x), lo(0.0) {} + constexpr double_double(float x) noexcept : hi(x), lo(0.0) {} + constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {} + constexpr operator double() const noexcept { return hi + lo; } + constexpr operator float() const noexcept { return hi + lo; } + + constexpr static double abs(double x) noexcept { return x >= 0 ? x : -x; } + + constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; } + constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept + { + const double sum = x.hi + y.hi; + return { sum, abs(x.hi) > abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo + : (((y.hi - sum) + x.hi) + x.lo) + y.lo }; + } + constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept + { + const double diff = x.hi - y.hi; + return { diff, abs(x.hi) > abs(y.hi) ? 
(((x.hi - diff) - y.hi) - y.lo) + x.lo + : (((-y.hi - diff) + x.hi) + x.lo) - y.lo }; + } + constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept + { + const double_double c = mul(x.hi, y.hi); + const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo; + return { c.hi, cc }; + } + constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept + { + const double c = x.hi / y.hi; + const double_double u = mul(c, y.hi); + const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi; + return { c, cc }; + } + + bool isinf() const noexcept { return std::isinf(hi); } + bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); } + bool iszero() const noexcept { return hi == 0 && lo == 0; } + + double ulp(float value) const noexcept + { + if (std::isnan(value) && isnan()) + return 0.0; + if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi))) + return 0.0; + if (value == 0 && iszero()) + return 0.0; + if (std::nexttoward(value, 0.0) == 0.0 && iszero()) + return 1.0; + return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); + } + double ulp(double value) const noexcept + { + if (std::isnan(value) && isnan()) + return 0.0; + if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi))) + return 0.0; + if (value == 0 && iszero()) + return 0.0; + if (std::nexttoward(value, 0.0) == 0.0 && iszero()) + return 1.0; + return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); + } + +private: + constexpr static double_double splitprec(double x) noexcept + { + const double p = x * 1.34217729e8; + const double h = (x - p) + p; + return { h, x - h }; + } + constexpr static double_double mul(double x, double y) noexcept + { + const double_double xx = splitprec(x); + const double_double yy = splitprec(y); + const double z = x * y; + return { z, ((xx.hi * yy.hi - z) + xx.hi * 
yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo }; + } +}; diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "comparison.hpp" @@ -12,7 +15,8 @@ #include <mpfr/mpfr.hpp> #include <mpfr/mpfr_tostring.hpp> #endif -#include "../ext/console_colors.hpp" +#include "console_colors.hpp" +#include <cassert> #include <chrono> #include <cmath> @@ -21,6 +25,7 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wexit-time-destructors") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpadded") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") namespace testo { @@ -101,6 +106,15 @@ inline test_case*& active_test() return instance; } +struct scope +{ + std::string text; + test_case* current_test; + scope* parent; + scope(std::string text); + ~scope(); +}; + struct test_case { using test_func = void (*)(); @@ -155,12 +169,14 @@ struct test_case } console_color cc(White); } + subtests.clear(); return !failed; } void check(bool result, const std::string& value, const char* expr) { - subtests.push_back(subtest{ result, as_string(padleft(22, expr), " | ", value), comment }); + subtests.push_back( + subtest{ result, as_string(padleft(22, expr), " | ", value), current_scope_text() }); result ? 
success++ : failed++; if (show_progress) { @@ -191,43 +207,59 @@ struct test_case check(result, as_string(comparison.left), expr); } - void append_comment(const std::string& text) + struct subtest + { + bool success; + std::string text; + std::string comment; + }; + + void scope_changed() { - comment += text; if (show_progress) { println(); - println(text, ":"); + println(current_scope_text(), ":"); } } - - void set_comment(const std::string& text) + std::string current_scope_text() const { - comment = text; - if (show_progress) + scope* s = this->current_scope; + std::string result; + while (s) { - println(); - println(text, ":"); + if (!result.empty()) + result = "; " + result; + result = s->text + result; + s = s->parent; } + return result; } - struct subtest - { - bool success; - std::string text; - std::string comment; - }; - test_func func; const char* name; std::vector<subtest> subtests; - std::string comment; int success; int failed; double time; bool show_progress; + scope* current_scope = nullptr; }; +inline scope::scope(std::string text) + : text(std::move(text)), current_test(active_test()), parent(current_test->current_scope) +{ + current_test->current_scope = this; + current_test->scope_changed(); +} + +inline scope::~scope() +{ + assert(active_test() == current_test); + assert(current_test->current_scope == this); + current_test->current_scope = parent; +} + template <typename Number> struct statistics { @@ -267,10 +299,10 @@ template <typename Arg0, typename Fn> void matrix(named_arg<Arg0>&& arg0, Fn&& fn) { cforeach(std::forward<Arg0>(arg0.value), [&](auto v0) { - active_test()->set_comment(as_string(arg0.name, " = ", v0)); + scope s(as_string(arg0.name, " = ", v0)); fn(v0); }); - if (active_test()->show_progress) + if (active_test() && active_test()->show_progress) println(); } @@ -278,7 +310,7 @@ template <typename Arg0, typename Arg1, typename Fn> void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, Fn&& fn) { 
cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), [&](auto v0, auto v1) { - active_test()->set_comment(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1)); + scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1)); fn(v0, v1); }); if (active_test()->show_progress) @@ -290,7 +322,7 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar { cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value), [&](auto v0, auto v1, auto v2) { - active_test()->set_comment( + scope s( as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", v2)); fn(v0, v1, v2); }); @@ -298,27 +330,53 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar println(); } +template <typename Arg0, typename Arg1, typename Arg2, typename Arg3, typename Fn> +void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& arg2, named_arg<Arg3>&& arg3, + Fn&& fn) +{ + cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value), + std::forward<Arg3>(arg3.value), [&](auto v0, auto v1, auto v2, auto v3) { + scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", + v2, arg3.name, " = ", v3)); + fn(v0, v1, v2, v3); + }); + if (active_test()->show_progress) + println(); +} + CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show_successful = false) { std::vector<test_case*> success; std::vector<test_case*> failed; + int success_checks = 0; + int failed_checks = 0; for (test_case* t : test_case::tests()) { if (name.empty() || t->name == name) + { t->run(show_successful) ? 
success.push_back(t) : failed.push_back(t); + success_checks += t->success; + failed_checks += t->failed; + } } printfmt("{}\n", std::string(79, '=')); if (!success.empty()) { console_color cc(Green); printfmt("[{}]", padcenter(11, "SUCCESS", '-')); - printfmt(" {} tests\n", success.size()); + printfmt(" {}/{} tests {}/{} checks\n", success.size(), success.size() + failed.size(), + success_checks, success_checks + failed_checks); } if (!failed.empty()) { console_color cc(Red); printfmt("[{}]", padcenter(11, "ERROR", '-')); - printfmt(" {} tests\n", failed.size()); + printfmt(" {}/{} tests {}/{} checks\n", failed.size(), success.size() + failed.size(), failed_checks, + success_checks + failed_checks); + for (test_case* t : failed) + { + print(" ", t->name, "\n"); + } } return static_cast<int>(failed.size()); } @@ -334,6 +392,13 @@ void assert_is_same_decay() static_assert(std::is_same<cometa::decay<T1>, cometa::decay<T2>>::value, ""); } +template <typename T, size_t NArgs> +struct test_data_entry +{ + T arguments[NArgs]; + T result; +}; + #define TESTO_CHECK(...) 
\ do \ { \ @@ -354,6 +419,7 @@ void assert_is_same_decay() #define TEST TESTO_TEST #define DTEST TESTO_DTEST #endif + } // namespace testo CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/version.hpp b/include/kfr/version.hpp @@ -25,8 +25,7 @@ */ #pragma once -#include "base/types.hpp" -#include "cpuid/cpuid_auto.hpp" +#include "runtime/cpuid_auto.hpp" namespace kfr { diff --git a/sources.cmake b/sources.cmake @@ -7,99 +7,52 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/all.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd.hpp ${PROJECT_SOURCE_DIR}/include/kfr/version.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cident.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/abs.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/asin_acos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/kfr.h ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/bitwise.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/clamp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/comparison.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/compiletime.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/complex.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/constants.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/digitreverse.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/function.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/function_expressions.hpp 
${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/horizontal.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/hyperbolic.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/logical.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/log_exp.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/min_max.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/modzerobessel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/operators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/platform.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/read_write.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/round.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/saturation.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/select.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/shuffle.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_clang.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_intrin.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_x86.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/sin_cos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/sqrt.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/tan.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/types.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/vec.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/intrinsics.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/kfr.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/specializations.i - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/abs.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/asin_acos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/atan.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/clamp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/gamma.hpp - 
${PROJECT_SOURCE_DIR}/include/kfr/base/impl/hyperbolic.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/logical.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/log_exp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/min_max.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/modzerobessel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/round.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/saturation.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/select.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sin_cos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sqrt.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/tan.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/array.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/cstring.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/ctti.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/function.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/named_arg.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/cometa/numeric.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/range.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/result.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/string.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/tuple.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid_auto.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/data/bitrev.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/data/sincos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/cache.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/convolution.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/dft_c.h + ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/bitrev.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/sincos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/bitrev.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-templates.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-templates.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/ft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp @@ -110,7 +63,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fir_design.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fracdelay.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/goertzel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dsp/interpolation.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/mixdown.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/oscillators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/sample_rate_conversion.hpp @@ -120,15 +72,114 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dsp/waveshaper.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/weighting.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/window.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/ext/console_colors.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/ext/double_double.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/audiofile.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h + ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i ${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp ${PROJECT_SOURCE_DIR}/include/kfr/testo/comparison.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/testo/console_colors.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/testo/double_double.hpp ${PROJECT_SOURCE_DIR}/include/kfr/testo/testo.hpp ) + + +set( + KFR_DFT_SRC + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f64.cpp +) + + +set( + KFR_IO_SRC + ${PROJECT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp +) + + +set( + KFR_UNITTEST_SRC + ${PROJECT_SOURCE_DIR}/tests/unit/base/conversion.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/base/reduce.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp + 
${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp +) + + +\ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -15,12 +15,26 @@ # along with KFR. -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) add_definitions(-DKFR_TESTING=1) +add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\") + +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") +if (ENABLE_ASMTEST) + add_executable(asm_test asm_test.cpp) + target_link_libraries(asm_test kfr) + target_set_arch(asm_test PRIVATE avx) + target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED) + + add_custom_command(TARGET asm_test POST_BUILD COMMAND objconv -fyasm $<TARGET_FILE:asm_test>) +endif() + if (NOT ARM) if(MSVC AND NOT CLANG) add_executable(multiarch multiarch.cpp multiarch_fir_sse2.cpp multiarch_fir_avx.cpp) @@ -34,67 +48,96 @@ if (NOT ARM) target_link_libraries(multiarch kfr) endif () -find_package(MPFR) -find_package(GMP) - set(ALL_TESTS_CPP - all_tests.cpp - base_test.cpp - complex_test.cpp - dsp_test.cpp - expression_test.cpp - intrinsic_test.cpp - io_test.cpp - resampler_test.cpp) + base_test.cpp + complex_test.cpp + dsp_test.cpp + expression_test.cpp + intrinsic_test.cpp + io_test.cpp + ${KFR_UNITTEST_SRC}) + +# set(ALL_TESTS_MERGED_CPP all_tests_merged.cpp) if (ENABLE_DFT) list(APPEND ALL_TESTS_CPP dft_test.cpp) endif () +find_package(MPFR) +find_package(GMP) + if (MPFR_FOUND AND GMP_FOUND) - list(APPEND ALL_TESTS_CPP transcendental_test.cpp) -else () - message(STATUS "MPFR is not found. 
Skipping transcendental_test") + message(STATUS "MPFR is found") + add_executable(generate_data generate_data.cpp) + target_link_libraries(generate_data kfr) + target_include_directories(generate_data PRIVATE ${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR}) + target_link_libraries(generate_data ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) + if (REGENERATE_TESTS) + add_custom_command(TARGET generate_data POST_BUILD + COMMENT "Generating tests..." + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests/data + COMMAND generate_data) + endif () endif () -add_executable(all_tests ${ALL_TESTS_CPP}) +add_executable(all_tests all_tests.cpp ${ALL_TESTS_CPP}) target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN) +target_link_libraries(all_tests kfr use_arch) if (ENABLE_DFT) - target_link_libraries(all_tests kfr kfr_dft) + target_link_libraries(all_tests kfr_dft) endif () -target_link_libraries(all_tests kfr kfr_io) +target_link_libraries(all_tests kfr_io) -if (MPFR_FOUND AND GMP_FOUND) - add_definitions(-DHAVE_MPFR) - include_directories(${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR}) - target_link_libraries(all_tests ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) -endif () +function(add_x86_test ARCH) + set(NAME ${ARCH}) -function(add_x86_test NAME FLAGS) - separate_arguments(FLAGS) - add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_IO_SRC}) + add_executable(all_tests_${NAME} all_tests.cpp ${ALL_TESTS_CPP} ${KFR_IO_SRC}) if (ENABLE_DFT) target_sources(all_tests_${NAME} PRIVATE ${KFR_DFT_SRC}) endif () - target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS}) - target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN) target_link_libraries(all_tests_${NAME} kfr) + target_set_arch(all_tests_${NAME} PRIVATE ${ARCH}) + target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN) target_compile_definitions(all_tests_${NAME} PUBLIC KFR_ENABLE_FLAC=1) - if (MPFR_FOUND AND GMP_FOUND) - target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) + + if (ARCH_TESTS_MULTI) + 
add_library(all_tests_multiarch_${NAME} STATIC ${ALL_TESTS_MERGED_CPP} ${KFR_IO_SRC}) + if (ENABLE_DFT) + target_sources(all_tests_multiarch_${NAME} PRIVATE ${KFR_DFT_SRC}) + endif () + target_link_libraries(all_tests_multiarch_${NAME} kfr) + target_set_arch(all_tests_multiarch_${NAME} PRIVATE ${ARCH}) + target_compile_definitions(all_tests_multiarch_${NAME} PRIVATE KFR_NO_MAIN) + target_compile_definitions(all_tests_multiarch_${NAME} PUBLIC KFR_ENABLE_FLAC=1) endif () + endfunction() if (ARCH_TESTS) - set (ARCH_RESET "-march=x86-64 -mno-sse3 -mno-ssse3 -mno-sse4.1 -mno-sse4.2 -mno-avx -mno-avx2 -mno-fma -mno-avx512f -mno-avx512cd -mno-avx512bw -mno-avx512dq -mno-avx512vl") - add_x86_test(generic "${ARCH_RESET} -DCMT_FORCE_GENERIC_CPU") - add_x86_test(sse2 "${ARCH_RESET} -msse2") - add_x86_test(sse3 "${ARCH_RESET} -msse3 -mno-avx") - add_x86_test(ssse3 "${ARCH_RESET} -mssse3 -mno-avx") - add_x86_test(sse41 "${ARCH_RESET} -msse4.1 -mno-avx") - add_x86_test(avx "${ARCH_RESET} -msse4.1 -mavx") - add_x86_test(avx2 "${ARCH_RESET} -msse4.1 -mavx2 -mfma") - add_x86_test(avx512 "${ARCH_RESET} -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl") + if (NOT MSVC OR CLANG) + add_x86_test(generic) + endif () + add_x86_test(sse2) + add_x86_test(sse3) + add_x86_test(ssse3) + add_x86_test(sse41) + add_x86_test(avx) + add_x86_test(avx2) + add_x86_test(avx512) + + if (ARCH_TESTS_MULTI) + add_executable(all_tests_multiarch all_tests.cpp) + target_compile_definitions(all_tests_multiarch PRIVATE KFR_MULTI_ARCH) + target_link_libraries(all_tests_multiarch + all_tests_multiarch_sse2 + all_tests_multiarch_sse3 + all_tests_multiarch_ssse3 + all_tests_multiarch_sse41 + all_tests_multiarch_avx + all_tests_multiarch_avx2 + all_tests_multiarch_avx512 + ) + endif () endif() if(USE_SDE) diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp @@ -7,6 +7,24 @@ using namespace kfr; +#ifdef KFR_MULTI_ARCH + +#define FORCE_LINK(arch) \ + namespace arch \ + { \ + extern void 
force_link(); \ + void (*p)() = &force_link; \ + } + +FORCE_LINK(sse2) +FORCE_LINK(sse3) +FORCE_LINK(ssse3) +FORCE_LINK(sse41) +FORCE_LINK(avx) +FORCE_LINK(avx2) +// FORCE_LINK(avx512) +#endif + int main() { println(library_version(), " running on ", cpu_runtime()); @@ -16,7 +34,7 @@ int main() return -1; } #ifdef HAVE_MPFR - mpfr::scoped_precision p(128); + mpfr::scoped_precision p(64); #endif return testo::run_all(""); } diff --git a/tests/all_tests_merged.cpp b/tests/all_tests_merged.cpp @@ -0,0 +1,25 @@ +#include <kfr/cident.h> + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") + +#include "auto_test.cpp" + +#include "base_test.cpp" +#include "complex_test.cpp" +#include "dsp_test.cpp" +#include "expression_test.cpp" +#include "intrinsic_test.cpp" +#include "io_test.cpp" +#include "resampler_test.cpp" + +#ifndef KFR_NO_DFT +#include "dft_test.cpp" +#endif + +namespace CMT_ARCH_NAME +{ +void force_link() {} +} // namespace CMT_ARCH_NAME + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp @@ -0,0 +1,213 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/base.hpp> +#include <kfr/io.hpp> +#include <kfr/testo/console_colors.hpp> + +using namespace kfr; + +#define TEST_ASM_8(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) \ + MACRO(fn, ty, 64) + +#define TEST_ASM_16(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) \ + MACRO(fn, ty, 64) + +#define TEST_ASM_32(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) + +#define TEST_ASM_64(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, 
ty, 16) + +#ifdef CMT_COMPILER_MSVC +#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT +#else +#define KFR_PUBLIC CMT_PUBLIC_C +#endif + +#define TEST_ASM_VTY1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x) { r = kfr::fn(x); } + +#define TEST_ASM_VTY1_F(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<flt_type<ty>, n>& r, const vec<ty, n>& x) \ + { \ + r = kfr::fn(x); \ + } + +#define TEST_ASM_VTY2(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, \ + const ty& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_CMP(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(mask<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_SHIFT(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, \ + const vec<utype<ty>, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_SHIFT_SCALAR(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, unsigned y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_VTY3(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y, \ + const vec<ty, n>& z) \ + { \ + r = kfr::fn(x, y, z); \ + } + +#define GEN_ty(n, ty) ty(n) +#define GEN_arg_def(n, ty) ty arg##n +#define GEN_arg(n, ty) arg##n + +#define TEST_ASM_MAKE_VECTOR(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, CMT_GEN_LIST(n, GEN_arg_def, ty)) \ + { \ + r = kfr::fn(CMT_GEN_LIST(n, GEN_arg, ty)); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__imm(vec<ty, n>& r) \ + { \ + r = kfr::fn(CMT_GEN_LIST(n, GEN_ty, ty)); \ + } + +#define TEST_ASM_BROADCAST(fn, ty, 
n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, ty x) { r = kfr::fn<n>(x); } + +#define TEST_ASM_HALF1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n * 2>& x) { r = kfr::fn(x); } + +#define TEST_ASM_DOUBLE2(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x, \ + const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } + +#define TEST_ASM_DOUBLE1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); } + +#define TEST_ASM_U(fn, MACRO) \ + TEST_ASM_8(fn, u8, MACRO) \ + TEST_ASM_16(fn, u16, MACRO) \ + TEST_ASM_32(fn, u32, MACRO) \ + TEST_ASM_64(fn, u64, MACRO) + +#define TEST_ASM_I(fn, MACRO) \ + TEST_ASM_8(fn, i8, MACRO) \ + TEST_ASM_16(fn, i16, MACRO) \ + TEST_ASM_32(fn, i32, MACRO) \ + TEST_ASM_64(fn, i64, MACRO) + +#define TEST_ASM_F(fn, MACRO) \ + TEST_ASM_32(fn, f32, MACRO) \ + TEST_ASM_64(fn, f64, MACRO) + +#define TEST_ASM_UI(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO) + +#define TEST_ASM_UIF(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO) + +#define TEST_ASM_IF(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO) + +TEST_ASM_UIF(add, TEST_ASM_VTY2) + +TEST_ASM_UIF(sub, TEST_ASM_VTY2) + +TEST_ASM_UIF(mul, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwiseand, TEST_ASM_VTY2) + +TEST_ASM_UIF(equal, TEST_ASM_CMP) + +TEST_ASM_IF(abs, TEST_ASM_VTY1) + +TEST_ASM_IF(sqrt, TEST_ASM_VTY1_F) + +TEST_ASM_IF(neg, TEST_ASM_VTY1) + +TEST_ASM_UIF(bitwisenot, TEST_ASM_VTY1) + +TEST_ASM_UIF(div, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwiseor, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwisexor, TEST_ASM_VTY2) + +TEST_ASM_UIF(notequal, TEST_ASM_CMP) + +TEST_ASM_UIF(less, TEST_ASM_CMP) + +TEST_ASM_UIF(greater, TEST_ASM_CMP) + +TEST_ASM_UIF(lessorequal, TEST_ASM_CMP) + +TEST_ASM_UIF(greaterorequal, TEST_ASM_CMP) + +TEST_ASM_UIF(low, TEST_ASM_HALF1) + +TEST_ASM_UIF(high, TEST_ASM_HALF1) + 
+TEST_ASM_UIF(concat, TEST_ASM_DOUBLE2) + +TEST_ASM_UIF(shl, TEST_ASM_SHIFT) + +TEST_ASM_UIF(shr, TEST_ASM_SHIFT) + +TEST_ASM_UIF(shl, TEST_ASM_SHIFT_SCALAR) + +TEST_ASM_UIF(shr, TEST_ASM_SHIFT_SCALAR) + +TEST_ASM_UIF(duphalfs, TEST_ASM_DOUBLE1) + +TEST_ASM_F(sin, TEST_ASM_VTY1_F) + +TEST_ASM_F(cos, TEST_ASM_VTY1_F) + +TEST_ASM_UIF(sqr, TEST_ASM_VTY1) + +TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR) + +TEST_ASM_UIF(broadcast, TEST_ASM_BROADCAST) + +namespace kfr +{ +#ifdef KFR_SHOW_NOT_OPTIMIZED +CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); } +#endif +} // namespace kfr + +int main() { println(library_version()); } diff --git a/tests/base_test.cpp b/tests/base_test.cpp @@ -6,11 +6,14 @@ #include <kfr/testo/testo.hpp> -#include <kfr/base.hpp> #include <kfr/io.hpp> +#include <kfr/simd.hpp> using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(test_basic) { // How to make a vector: @@ -76,359 +79,20 @@ TEST(test_basic) CHECK(odd(numbers1) == vec<int, 4>{ 1, 3, 5, 7 }); CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 }); - // * The following command pairs are equivalent: - CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - - CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) == - vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - - CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) == - vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); - CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) == - vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); - - CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7)); - CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); - - 
CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7)); - CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); - CHECK(subadd(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) == pack(-10, 11, -8, 13, -6, 15, -4, 17)); CHECK(addsub(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) == pack(10, -9, 12, -7, 14, -5, 16, -3)); - CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1)); - CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2)); - CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4)); - CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8)); - - CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6)); - CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7)); - - CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5)); - CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7)); - - CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0)); - CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1)); - CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3)); - CHECK(digitreverse4(pack(0.f, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) == pack(0.f, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); - CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3)); - CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3)); - CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6)); - CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7)); - CHECK(inrange(pack(1, 2, 3), 1, 3) == make_mask<int>(true, true, true)); CHECK(inrange(pack(1, 2, 3), 1, 2) == make_mask<int>(true, true, false)); CHECK(inrange(pack(1, 2, 3), 1, 1) == make_mask<int>(true, false, false)); - - // * Transpose matrix: - const auto sixteen = enumerate<float, 16>(); - CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 
4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); -} - -TEST(concat) -{ - CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) // - == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }); -} - -TEST(split) -{ - vec<f32, 1> a1; - vec<f32, 2> a23; - vec<f32, 1> a4; - vec<f32, 3> a567; - split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567); - CHECK(a1 == vec<f32, 1>{ 1 }); - CHECK(a23 == vec<f32, 2>{ 2, 3 }); - CHECK(a4 == vec<f32, 1>{ 4 }); - CHECK(a567 == vec<f32, 3>{ 5, 6, 7 }); -} - -TEST(broadcast) -{ - CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); - CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); - CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); -} - -TEST(resize) -{ - CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); - CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); - CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); -} - -TEST(make_vector) -{ - const signed char ch = -1; - CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 }); - const i64 v = -100; - CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 }); - CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 }); - CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 }); - - CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) == - vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } }); - CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } }); -} - -TEST(apply) -{ - CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6)); - CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25)); -} - -TEST(zerovector) -{ - CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 }); - // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here - CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 }); -} - 
-TEST(allonesvector) -{ - CHECK(bitcast<u32>(constants<f32>::allones()) == 0xFFFFFFFFu); - CHECK(bitcast<u64>(constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull); - - CHECK(~allonesvector<f32, 3>() == f32x3{ 0, 0, 0 }); - CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 }); - CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 }); } -TEST(low_high) -{ - CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8)); - - CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7)); - - CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6)); - - CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5)); - - CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2)); - CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4)); - - CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2)); - CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3)); - - CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1)); - CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2)); -} - -#ifdef CMT_COMPILER_CLANG -TEST(matrix) -{ - using i32x2x2 = vec<vec<int, 2>, 2>; - const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; - CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } }); - - CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } }); - - i32x2 xy{ 10, 20 }; - i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; - xy = hadd(xy * m); - CHECK(xy == i32x2{ 40, 120 }); - - i32x2 xy2{ 10, 20 }; - xy2 = hadd(transpose(xy2 * m)); - CHECK(xy2 == i32x2{ 50, 110 }); -} -#endif - -TEST(is_convertible) -{ - static_assert(std::is_convertible<float, f32x4>::value, ""); - static_assert(std::is_convertible<float, f64x8>::value, ""); - static_assert(std::is_convertible<float, u8x3>::value, ""); - - 
static_assert(std::is_convertible<u16x4, i32x4>::value, ""); - static_assert(!std::is_convertible<u16x4, i32x3>::value, ""); - static_assert(!std::is_convertible<u16x1, u16x16>::value, ""); - - static_assert(std::is_convertible<float, complex<float>>::value, ""); - static_assert(std::is_convertible<float, complex<double>>::value, ""); - static_assert(std::is_convertible<short, complex<double>>::value, ""); - - static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, ""); - static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, ""); - - static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); - static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); - - testo::assert_is_same<i32x4, common_type<i32x4>>(); - testo::assert_is_same<u32x4, common_type<i32x4, u32x4>>(); - testo::assert_is_same<f64x4, common_type<i32x4, u32x4, f64x4>>(); - - CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); - CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); - CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); - - CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 }); - - CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f }); - CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. }); - CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. }); - - CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) == - vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); - - CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) == - vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. 
} }); - - CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{ - vec<float, 4>{ 1.f, 2.f, 3.f, 4.f }, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) == - vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } }); -} - -TEST(transcendental) -{ - CHECK(kfr::sin(1.0f) == 0.8414709848078965066525023216303f); - CHECK(kfr::sin(1.0) == 0.8414709848078965066525023216303); - - CHECK(kfr::cos(1.0f) == 0.54030230586813971740093660744298f); - CHECK(kfr::cos(1.0) == 0.54030230586813971740093660744298); - - CHECK(kfr::tan(1.0f) == 1.5574077246549022305069748074584f); - CHECK(kfr::tan(1.0) == 1.5574077246549022305069748074584); - - CHECK(kfr::asin(0.45f) == 0.46676533904729636185033976030414f); - CHECK(kfr::asin(0.45) == 0.46676533904729636185033976030414); - - CHECK(kfr::acos(0.45f) == 1.1040309877476002573809819313356f); - CHECK(kfr::acos(0.45) == 1.1040309877476002573809819313356); - - CHECK(kfr::atan(0.45f) == 0.42285392613294071296648279098114f); - CHECK(kfr::atan(0.45) == 0.42285392613294071296648279098114); - - CHECK(kfr::sinh(1.0f) == 1.1752011936438014568823818505956f); - CHECK(kfr::sinh(1.0) == 1.1752011936438014568823818505956); - - CHECK(kfr::cosh(1.0f) == 1.5430806348152437784779056207571f); - CHECK(kfr::cosh(1.0) == 1.5430806348152437784779056207571); - - CHECK(kfr::tanh(1.0f) == 0.76159415595576488811945828260479f); - CHECK(kfr::tanh(1.0) == 0.76159415595576488811945828260479); - - CHECK(kfr::exp(0.75f) == 2.1170000166126746685453698198371f); - CHECK(kfr::exp(0.75) == 2.1170000166126746685453698198371); - - CHECK(kfr::exp(-0.75f) == 0.47236655274101470713804655094327f); - CHECK(kfr::exp(-0.75) == 0.47236655274101470713804655094327); - - CHECK(kfr::log(2.45f) == 0.89608802455663561677548191074382f); - CHECK(kfr::log(2.45) == 0.89608802455663561677548191074382); -} - -TEST(horner) -{ - CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34)); - CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786)); 
- CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262)); -} - -TEST(test_stat) -{ - { - univector<float, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(rms(a) == 3.316624790355399849115f); - CHECK(product(a) == 120); - } - { - univector<double, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(rms(a) == 3.316624790355399849115); - CHECK(product(a) == 120); - } - { - univector<int, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(product(a) == 120); - } - { - univector<complex<float>, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == c32{ 15 }); - CHECK(mean(a) == c32{ 3 }); - CHECK(sumsqr(a) == c32{ 55 }); - CHECK(product(a) == c32{ 120 }); - } -} - -TEST(sample_conversion) -{ - CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f); - CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f); - CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f); - CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f); - CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f); - - CHECK(convert_sample<i8>(-1.f) == -127); - CHECK(convert_sample<i8>(0.f) == 0); - CHECK(convert_sample<i8>(1.f) == 127); - - CHECK(convert_sample<i16>(-1.f) == -32767); - CHECK(convert_sample<i16>(0.f) == 0); - 
CHECK(convert_sample<i16>(1.f) == 32767); - - CHECK(convert_sample<i24>(-1.f) == -8388607); - CHECK(convert_sample<i24>(0.f) == 0); - CHECK(convert_sample<i24>(1.f) == 8388607); - - CHECK(convert_sample<i32>(-1.f) == -2147483647); - CHECK(convert_sample<i32>(0.f) == 0); - CHECK(convert_sample<i32>(1.f) == 2147483647); -} - -TEST(sample_interleave_deinterleave) -{ - const size_t size = 50; - univector2d<float> in; - in.push_back(truncate(counter() * 3.f + 0.f, size)); - in.push_back(truncate(counter() * 3.f + 1.f, size)); - in.push_back(truncate(counter() * 3.f + 2.f, size)); - univector<float> out(size * 3); - interleave(out.data(), (const float* []){ in[0].data(), in[1].data(), in[2].data() }, 3, size); - CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0); - - deinterleave((float* []){ in[0].data(), in[1].data(), in[2].data() }, out.data(), 3, size); - - CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0); - CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0); - CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0); -} +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp @@ -11,6 +11,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(complex_vector) { const vec<c32, 1> c32x1{ c32{ 0, 1 } }; @@ -68,9 +71,11 @@ TEST(complex_math) { const vec<c32, 1> a{ c32{ 1, 2 } }; const vec<c32, 1> b{ c32{ 3, 4 } }; + CHECK(c32(vec<c32, 1>(2)[0]) == c32{ 2, 0 }); CHECK(a + b == make_vector(c32{ 4, 6 })); CHECK(a - b == make_vector(c32{ -2, -2 })); CHECK(a * b == make_vector(c32{ -5, 10 })); + CHECK(a * vec<c32, 1>(2) == make_vector(c32{ 2, 4 })); CHECK(a * 2 == make_vector(c32{ 2, 4 })); CHECK(a / b == make_vector(c32{ 0.44f, 0.08f })); CHECK(-a == make_vector(c32{ -1, -2 })); @@ -88,8 +93,7 @@ TEST(complex_math) CHECK(cabs(-3.f) == 3.f); CHECK(cabs(make_vector(-3.f)) == make_vector(3.f)); - testo::epsilon<f32>() *= 5; - testo::epsilon<f64>() *= 5; + 
testo::eplison_scope<void> eps(5); CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773f, 0.634963914784736f }); CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, -0.9888977057628651f }); @@ -176,13 +180,6 @@ TEST(complex_function_expressions) TEST(static_tests) { -#ifdef CMT_ARCH_SSE2 - static_assert(platform<f32, cpu_t::sse2>::vector_width == 4, ""); - static_assert(platform<c32, cpu_t::sse2>::vector_width == 2, ""); - static_assert(platform<i32, cpu_t::sse2>::vector_width == 4, ""); - static_assert(platform<complex<i32>, cpu_t::sse2>::vector_width == 2, ""); -#endif - static_assert(is_numeric<vec<complex<float>, 4>>::value, ""); static_assert(is_numeric_args<vec<complex<float>, 4>>::value, ""); @@ -207,8 +204,9 @@ TEST(static_tests) testo::assert_is_same<kfr::internal::arg<complex<int>>, kfr::internal::expression_scalar<kfr::complex<int>, 1>>(); - testo::assert_is_same<common_type<complex<int>, double>, complex<double>>(); + testo::assert_is_same<kfr::common_type<complex<int>, double>, complex<double>>(); } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/data/acos_double_fuzz b/tests/data/acos_double_fuzz Binary files differ. diff --git a/tests/data/acos_double_narrow b/tests/data/acos_double_narrow Binary files differ. diff --git a/tests/data/acos_float_fuzz b/tests/data/acos_float_fuzz Binary files differ. diff --git a/tests/data/acos_float_narrow b/tests/data/acos_float_narrow Binary files differ. diff --git a/tests/data/asin_double_fuzz b/tests/data/asin_double_fuzz Binary files differ. diff --git a/tests/data/asin_double_narrow b/tests/data/asin_double_narrow Binary files differ. diff --git a/tests/data/asin_float_fuzz b/tests/data/asin_float_fuzz Binary files differ. diff --git a/tests/data/asin_float_narrow b/tests/data/asin_float_narrow Binary files differ. diff --git a/tests/data/atan2_double_fuzz b/tests/data/atan2_double_fuzz Binary files differ. 
diff --git a/tests/data/atan2_double_narrow b/tests/data/atan2_double_narrow Binary files differ. diff --git a/tests/data/atan2_float_fuzz b/tests/data/atan2_float_fuzz Binary files differ. diff --git a/tests/data/atan2_float_narrow b/tests/data/atan2_float_narrow Binary files differ. diff --git a/tests/data/atan_double_fuzz b/tests/data/atan_double_fuzz Binary files differ. diff --git a/tests/data/atan_double_narrow b/tests/data/atan_double_narrow Binary files differ. diff --git a/tests/data/atan_float_fuzz b/tests/data/atan_float_fuzz Binary files differ. diff --git a/tests/data/atan_float_narrow b/tests/data/atan_float_narrow Binary files differ. diff --git a/tests/data/cbrt_double_fuzz b/tests/data/cbrt_double_fuzz Binary files differ. diff --git a/tests/data/cbrt_double_narrow b/tests/data/cbrt_double_narrow Binary files differ. diff --git a/tests/data/cbrt_float_fuzz b/tests/data/cbrt_float_fuzz Binary files differ. diff --git a/tests/data/cbrt_float_narrow b/tests/data/cbrt_float_narrow Binary files differ. diff --git a/tests/data/cos_double_fuzz b/tests/data/cos_double_fuzz Binary files differ. diff --git a/tests/data/cos_double_narrow b/tests/data/cos_double_narrow Binary files differ. diff --git a/tests/data/cos_float_fuzz b/tests/data/cos_float_fuzz Binary files differ. diff --git a/tests/data/cos_float_narrow b/tests/data/cos_float_narrow Binary files differ. diff --git a/tests/data/cosh_double_fuzz b/tests/data/cosh_double_fuzz Binary files differ. diff --git a/tests/data/cosh_double_narrow b/tests/data/cosh_double_narrow Binary files differ. diff --git a/tests/data/cosh_float_fuzz b/tests/data/cosh_float_fuzz Binary files differ. diff --git a/tests/data/cosh_float_narrow b/tests/data/cosh_float_narrow Binary files differ. diff --git a/tests/data/coth_double_fuzz b/tests/data/coth_double_fuzz Binary files differ. diff --git a/tests/data/coth_double_narrow b/tests/data/coth_double_narrow Binary files differ. 
diff --git a/tests/data/coth_float_fuzz b/tests/data/coth_float_fuzz Binary files differ. diff --git a/tests/data/coth_float_narrow b/tests/data/coth_float_narrow Binary files differ. diff --git a/tests/data/exp10_double_fuzz b/tests/data/exp10_double_fuzz Binary files differ. diff --git a/tests/data/exp10_double_narrow b/tests/data/exp10_double_narrow Binary files differ. diff --git a/tests/data/exp10_float_fuzz b/tests/data/exp10_float_fuzz Binary files differ. diff --git a/tests/data/exp10_float_narrow b/tests/data/exp10_float_narrow Binary files differ. diff --git a/tests/data/exp2_double_fuzz b/tests/data/exp2_double_fuzz Binary files differ. diff --git a/tests/data/exp2_double_narrow b/tests/data/exp2_double_narrow Binary files differ. diff --git a/tests/data/exp2_float_fuzz b/tests/data/exp2_float_fuzz Binary files differ. diff --git a/tests/data/exp2_float_narrow b/tests/data/exp2_float_narrow Binary files differ. diff --git a/tests/data/exp_double_fuzz b/tests/data/exp_double_fuzz Binary files differ. diff --git a/tests/data/exp_double_narrow b/tests/data/exp_double_narrow Binary files differ. diff --git a/tests/data/exp_float_fuzz b/tests/data/exp_float_fuzz Binary files differ. diff --git a/tests/data/exp_float_narrow b/tests/data/exp_float_narrow Binary files differ. diff --git a/tests/data/gamma_double_fuzz b/tests/data/gamma_double_fuzz Binary files differ. diff --git a/tests/data/gamma_double_narrow b/tests/data/gamma_double_narrow Binary files differ. diff --git a/tests/data/gamma_float_fuzz b/tests/data/gamma_float_fuzz Binary files differ. diff --git a/tests/data/gamma_float_narrow b/tests/data/gamma_float_narrow Binary files differ. diff --git a/tests/data/log10_double_fuzz b/tests/data/log10_double_fuzz Binary files differ. diff --git a/tests/data/log10_double_narrow b/tests/data/log10_double_narrow Binary files differ. diff --git a/tests/data/log10_float_fuzz b/tests/data/log10_float_fuzz Binary files differ. 
diff --git a/tests/data/log10_float_narrow b/tests/data/log10_float_narrow Binary files differ. diff --git a/tests/data/log2_double_fuzz b/tests/data/log2_double_fuzz Binary files differ. diff --git a/tests/data/log2_double_narrow b/tests/data/log2_double_narrow Binary files differ. diff --git a/tests/data/log2_float_fuzz b/tests/data/log2_float_fuzz Binary files differ. diff --git a/tests/data/log2_float_narrow b/tests/data/log2_float_narrow Binary files differ. diff --git a/tests/data/log_double_fuzz b/tests/data/log_double_fuzz Binary files differ. diff --git a/tests/data/log_double_narrow b/tests/data/log_double_narrow Binary files differ. diff --git a/tests/data/log_float_fuzz b/tests/data/log_float_fuzz Binary files differ. diff --git a/tests/data/log_float_narrow b/tests/data/log_float_narrow Binary files differ. diff --git a/tests/data/sin_double_fuzz b/tests/data/sin_double_fuzz Binary files differ. diff --git a/tests/data/sin_double_narrow b/tests/data/sin_double_narrow Binary files differ. diff --git a/tests/data/sin_float_fuzz b/tests/data/sin_float_fuzz Binary files differ. diff --git a/tests/data/sin_float_narrow b/tests/data/sin_float_narrow Binary files differ. diff --git a/tests/data/sinh_double_fuzz b/tests/data/sinh_double_fuzz Binary files differ. diff --git a/tests/data/sinh_double_narrow b/tests/data/sinh_double_narrow Binary files differ. diff --git a/tests/data/sinh_float_fuzz b/tests/data/sinh_float_fuzz Binary files differ. diff --git a/tests/data/sinh_float_narrow b/tests/data/sinh_float_narrow Binary files differ. diff --git a/tests/data/tan_double_fuzz b/tests/data/tan_double_fuzz Binary files differ. diff --git a/tests/data/tan_double_narrow b/tests/data/tan_double_narrow Binary files differ. diff --git a/tests/data/tan_float_fuzz b/tests/data/tan_float_fuzz Binary files differ. diff --git a/tests/data/tan_float_narrow b/tests/data/tan_float_narrow Binary files differ. 
diff --git a/tests/data/tanh_double_fuzz b/tests/data/tanh_double_fuzz Binary files differ. diff --git a/tests/data/tanh_double_narrow b/tests/data/tanh_double_narrow Binary files differ. diff --git a/tests/data/tanh_float_fuzz b/tests/data/tanh_float_fuzz Binary files differ. diff --git a/tests/data/tanh_float_narrow b/tests/data/tanh_float_narrow Binary files differ. diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -14,6 +14,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + #ifdef KFR_NATIVE_F64 constexpr ctypes_t<float, double> dft_float_types{}; #else @@ -25,7 +28,7 @@ TEST(test_convolve) univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 }); univector<fbase> c = convolve(a, b); - CHECK(c.size() == 9); + CHECK(c.size() == 9u); CHECK(rms(c - univector<fbase>({ 0.25, 1., 2.75, 2.5, 3.75, 3.5, 1.5, -4., 7.5 })) < 0.0001); } @@ -44,7 +47,7 @@ TEST(test_correlate) univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 }); univector<fbase> c = correlate(a, b); - CHECK(c.size() == 9); + CHECK(c.size() == 9u); CHECK(rms(c - univector<fbase>({ 1.5, 1., 1.5, 2.5, 3.75, -4., 7.75, 3.5, 1.25 })) < 0.0001); } @@ -87,58 +90,60 @@ TEST(fft_accuracy) #endif println(sizes); - testo::matrix( - named("type") = dft_float_types, // - named("size") = sizes, // - [&gen](auto type, size_t size) { - using float_type = type_of<decltype(type)>; - const double min_prec = 0.000001 * std::log(size) * size; - - for (bool inverse : { false, true }) - { - testo::active_test()->append_comment(inverse ? 
"complex-inverse" : "complex-direct"); - univector<complex<float_type>> in = - truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - univector<complex<float_type>> out = in; - univector<complex<float_type>> refout = out; - univector<complex<float_type>> outo = in; - const dft_plan<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - reference_dft(refout.data(), in.data(), size, inverse); - dft.execute(outo, in, temp, inverse); - dft.execute(out, out, temp, inverse); - - const float_type rms_diff_inplace = rms(cabs(refout - out)); - CHECK(rms_diff_inplace < min_prec); - const float_type rms_diff_outofplace = rms(cabs(refout - outo)); - CHECK(rms_diff_outofplace < min_prec); - } - - if (size >= 4 && is_poweroftwo(size)) - { - univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - - univector<complex<float_type>> out = truncate(scalar(qnan), size); - univector<complex<float_type>> refout = truncate(scalar(qnan), size); - const dft_plan_real<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - testo::active_test()->append_comment("real-direct"); - reference_fft(refout.data(), in.data(), size); - dft.execute(out, in, temp); - float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); - CHECK(rms_diff < min_prec); - - univector<float_type> out2(size, 0.f); - testo::active_test()->append_comment("real-inverse"); - dft.execute(out2, out, temp); - out2 = out2 / size; - rms_diff = rms(in - out2); - CHECK(rms_diff < min_prec); - } - }); + testo::matrix(named("type") = dft_float_types, // + named("size") = sizes, // + [&gen](auto type, size_t size) { + using float_type = type_of<decltype(type)>; + const double min_prec = 0.000001 * std::log(size) * size; + + for (bool inverse : { false, true }) + { + testo::scope s(inverse ? 
"complex-inverse" : "complex-direct"); + univector<complex<float_type>> in = + truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + univector<complex<float_type>> out = in; + univector<complex<float_type>> refout = out; + univector<complex<float_type>> outo = in; + const dft_plan<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + reference_dft(refout.data(), in.data(), size, inverse); + dft.execute(outo, in, temp, inverse); + dft.execute(out, out, temp, inverse); + + const float_type rms_diff_inplace = rms(cabs(refout - out)); + CHECK(rms_diff_inplace < min_prec); + const float_type rms_diff_outofplace = rms(cabs(refout - outo)); + CHECK(rms_diff_outofplace < min_prec); + } + + if (size >= 4 && is_poweroftwo(size)) + { + univector<float_type> in = + truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + + univector<complex<float_type>> out = truncate(scalar(qnan), size); + univector<complex<float_type>> refout = truncate(scalar(qnan), size); + const dft_plan_real<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + testo::scope s("real-direct"); + reference_fft(refout.data(), in.data(), size); + dft.execute(out, in, temp); + float_type rms_diff = + rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); + CHECK(rms_diff < min_prec); + + univector<float_type> out2(size, 0.f); + s.text = "real-inverse"; + dft.execute(out2, out, temp); + out2 = out2 / size; + rms_diff = rms(in - out2); + CHECK(rms_diff < min_prec); + } + }); } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp @@ -15,6 +15,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + struct TestFragment { float gain; // dB @@ -235,6 +238,13 @@ TEST(ebu_lra_1_2_3_and_4) }); } +TEST(note_to_hertz) +{ + testo::eplison_scope<void> eps(1000); + CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233)); + CHECK(kfr::note_to_hertz(pack(60)) == 
pack(fbase(261.6255653005986346778499935233))); +} + TEST(delay) { const univector<float, 33> v1 = counter() + 100; @@ -265,7 +275,7 @@ TEST(mixdown) [](size_t i) { return i + i * 2 + 100; }); } -#ifdef CMT_COMPILER_CLANG +#ifdef CMT_COMPILER_CLANG__ TEST(mixdown_stereo) { const univector<double, 21> left = counter(); @@ -289,29 +299,85 @@ TEST(phasor) TEST(fir) { - const univector<double, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); - const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; - - CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> double { - double result = 0.0; - for (size_t i = 0; i < taps.size(); i++) - result += data.get(index - i, 0.0) * taps[i]; - return result; - }); +#ifdef CMT_COMPILER_MSVC + // testo::matrix causes error in MSVC + { + using T = float; + + const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + } + { + using T = double; + + const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + } +#else + 
testo::matrix(named("type") = ctypes_t<float +#ifdef KFR_NATIVE_F64 + , + double +#endif + >{}, + [](auto type) { + using T = type_of<decltype(type)>; - CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> double { - double result = 0.0; - for (size_t i = 0; i < taps.size(); i++) - result += data.get(index - i, 0.0) * taps[i]; - return result; - }); + const univector<T, 100> data = + counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + }); +#endif } #ifdef KFR_NATIVE_F64 TEST(fir_different) { const univector<float, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5f); - const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + // const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + const univector<double, 4> taps{ 1, 2, 3, 4 }; CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> float { double result = 0.0; @@ -375,6 +441,114 @@ TEST(fir_complex) }); } +template <typename E, typename T, size_t size> +void test_ir(E&& e, const univector<T, size>& test_vector) +{ + substitute(e, to_pointer(unitimpulse<T>())); + const univector<T, size> ir = e; + println(absmaxof(ir - test_vector)); +} + +template <typename T, typename... Ts, univector_tag Tag> +inline const univector<T, Tag>& choose_array(const univector<T, Tag>& array, const univector<Ts, Tag>&...) +{ + return array; +} + +template <typename T, typename T2, typename... 
Ts, univector_tag Tag, KFR_ENABLE_IF(!is_same<T, T2>::value)> +inline const univector<T, Tag>& choose_array(const univector<T2, Tag>&, const univector<Ts, Tag>&... arrays) +{ + return choose_array<T>(arrays...); +} + +TEST(biquad_lowpass1) +{ + testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) { + using T = type_of<decltype(type)>; + + const biquad_params<T> bq = biquad_lowpass<T>(0.1, 0.7); + + constexpr size_t size = 32; + + const univector<float, size> test_vector_f32{ + +0x8.9bce2p-7, +0xd.8383ep-6, +0x8.f908dp-5, +0xe.edc21p-6, +0x9.ae104p-6, +0x9.dcc24p-7, + +0xd.50584p-9, -0xf.2668p-13, -0xd.09ca1p-10, -0xe.15995p-10, -0xa.b90d2p-10, -0xc.edea4p-11, + -0xb.f14eap-12, -0xc.2cb44p-14, +0xb.4a4dep-15, +0xb.685dap-14, +0xa.b181fp-14, +0xf.0cb2bp-15, + +0x8.695d6p-15, +0xd.bedd4p-17, +0xf.5474p-20, -0xd.bb266p-19, -0x9.63ca1p-18, -0xf.ca567p-19, + -0xa.5231p-19, -0xa.9e934p-20, -0xe.ab52p-22, +0xa.3c4cp-26, +0xd.721ffp-23, +0xe.ccc1ap-23, + +0xb.5f248p-23, +0xd.d2c9ap-24, + }; + + const univector<double, size> test_vector_f64{ + +0x8.9bce2bf3663e8p-7, +0xd.8384010fdf1dp-6, +0x8.f908e7a36df6p-5, +0xe.edc2332a6d0bp-6, + +0x9.ae104af1da9ap-6, +0x9.dcc235ef68e7p-7, +0xd.5057ee425e05p-9, -0xf.266e42a99aep-13, + -0xd.09cad73642208p-10, -0xe.1599f32a83dp-10, -0xa.b90d8910a117p-10, -0xc.edeaabb890948p-11, + -0xb.f14edbb55383p-12, -0xc.2cb39b86f2dap-14, +0xb.4a506ecff055p-15, +0xb.685edfdb55358p-14, + +0xa.b182e32f8e298p-14, +0xf.0cb3dfd894b2p-15, +0x8.695df725b4438p-15, +0xd.beddc3606b9p-17, + +0xf.547004d20874p-20, -0xd.bb29b25b49b6p-19, -0x9.63cb9187da1dp-18, -0xf.ca588634fc618p-19, + -0xa.52322d320da78p-19, -0xa.9e9420154e4p-20, -0xe.ab51f7b0335ap-22, +0xa.3c6479980e1p-26, + +0xd.7223836599fp-23, +0xe.ccc47ddd18678p-23, +0xb.5f265b1be1728p-23, +0xd.d2cb83f8483f8p-24, + }; + + const univector<T, size> ir = biquad(bq, unitimpulse<T>()); + + CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0); + }); +} + 
+TEST(biquad_lowpass2) +{ + testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) { + using T = type_of<decltype(type)>; + + const biquad_params<T> bq = biquad_lowpass<T>(0.45, 0.2); + + constexpr size_t size = 32; + + const univector<float, size> test_vector_f32{ + +0x8.ce416p-4, +0x8.2979p-4, -0x8.a9d04p-7, +0xe.aeb3p-11, +0x8.204f8p-13, -0x8.20d78p-12, + +0x8.3379p-12, -0xf.83d81p-13, +0xe.8b5c4p-13, -0xd.9ddadp-13, +0xc.bedfcp-13, -0xb.ee123p-13, + +0xb.2a9e5p-13, -0xa.73ac4p-13, +0x9.c86f6p-13, -0x9.2828p-13, +0x8.92229p-13, -0x8.05b7p-13, + +0xf.048ffp-14, -0xe.0e849p-14, +0xd.28384p-14, -0xc.50a9p-14, +0xb.86e56p-14, -0xa.ca0b6p-14, + +0xa.19476p-14, -0x9.73d38p-14, +0x8.d8f64p-14, -0x8.48024p-14, +0xf.80aa2p-15, -0xe.82ad8p-15, + +0xd.94f22p-15, -0xc.b66d9p-15, + }; + + const univector<double, size> test_vector_f64{ + +0x8.ce416c0d31e88p-4, +0x8.2978efe51dafp-4, -0x8.a9d088b81da6p-7, +0xe.aeb56c029358p-11, + +0x8.20492639873ap-13, -0x8.20d4e21aab538p-12, +0x8.3376b2d53b4a8p-12, -0xf.83d3d1c17343p-13, + +0xe.8b584f0dd5ac8p-13, -0xd.9dd740ceaacf8p-13, +0xc.bedc85e7a621p-13, -0xb.ee0f472bf8968p-13, + +0xb.2a9baed1fe6cp-13, -0xa.73a9d1670f4ep-13, +0x9.c86d29d297798p-13, -0x9.2825f4d894088p-13, + +0x8.9220a956d651p-13, -0x8.05b539fdd79e8p-13, +0xf.048cb5194cfa8p-14, -0xe.0e819fa128938p-14, + +0xd.2835957d684cp-14, -0xc.50a69c2a8dc18p-14, +0xb.86e33bbaf3cbp-14, -0xa.ca097058af2cp-14, + +0xa.1945ad1703dcp-14, -0x9.73d1eef7d8b68p-14, +0x8.d8f4df1bb3efp-14, -0x8.48010323c6f7p-14, + +0xf.80a7f5baeeb2p-15, -0xe.82ab94bb68a8p-15, +0xd.94f05f80af008p-15, -0xc.b66c0799b21a8p-15, + }; + + const univector<T, size> ir = biquad(bq, unitimpulse<T>()); + + CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0); + }); +} + +TEST(resampler_test) +{ + const int in_sr = 44100; + const int out_sr = 48000; + const int freq = 100; + auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr); + double delay = 
resampler.get_fractional_delay(); + univector<fbase> out(out_sr / 10); + univector<fbase> in = truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10); + univector<fbase> ref = truncate( + sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))), + out_sr / 10); + resampler.process(out, in); + + CHECK(rms(slice(out - ref, static_cast<size_t>(ceil(delay * 2)))) < 0.005f); +} +} // namespace CMT_ARCH_NAME + #ifndef KFR_NO_MAIN int main() { diff --git a/tests/ebu_test.cpp b/tests/ebu_test.cpp @@ -1,122 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/testo/testo.hpp> - -#include <kfr/base.hpp> -#include <kfr/dsp.hpp> -#include <kfr/io.hpp> - -using namespace kfr; - -int main(int argc, char** argv) -{ - if (argc < 3) - { - println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER"); - return 1; - } - - // Prepare - FILE* f = fopen(argv[1], "rb"); - const int channel_number = atoi(argv[2]); - if (channel_number < 1 || channel_number > 6) - { - println("Incorrect number of channels"); - return 1; - } - fseek(f, 0, SEEK_END); - uintmax_t size = ftell(f); - fseek(f, 0, SEEK_SET); - if (size % (sizeof(float) * channel_number)) - { - println("Incorrect file size"); - return 1; - } - - // Read file - const size_t length = size / (sizeof(float) * channel_number); - univector<float> interleaved(size / sizeof(float)); - size_t read_len = fread(interleaved.data(), 1, size, f); - if (read_len != size) - { - println("Can't read file"); - return 1; - } - - // Deinterleave - univector<univector<float>> data(channel_number, univector<float>(length)); - for (size_t ch = 0; ch < channel_number; ++ch) - { - for (size_t i = 0; i < length; ++i) - { - data[ch][i] = interleaved[i * channel_number + ch]; - } - } - - std::vector<Speaker> speakers; - switch (channel_number) - { - case 1: - speakers = { Speaker::Mono }; - break; - case 2: - speakers = { 
Speaker::Left, Speaker::Right }; - break; - case 3: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center }; - break; - case 4: - speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround }; - break; - case 5: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround, - Speaker::RightSurround }; - break; - case 6: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center, - Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe }; - break; - } - - ebu_r128<float> loudness(48000, speakers); - - float M, S, I, RL, RH; - float maxM = -HUGE_VALF, maxS = -HUGE_VALF; - for (size_t i = 0; i < length / loudness.packet_size(); i++) - { - std::vector<univector_ref<float>> channels; - for (size_t ch = 0; ch < channel_number; ++ch) - { - channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size())); - } - loudness.process_packet(channels); - loudness.get_values(M, S, I, RL, RH); - maxM = std::max(maxM, M); - maxS = std::max(maxS, S); - } - - { - // For file-based measurements, the signal should be followed by at least 1.5 s of silence - std::vector<univector_dyn<float>> channels(channel_number, - univector_dyn<float>(loudness.packet_size())); - for (size_t i = 0; i < 15; ++i) - loudness.process_packet(channels); - float dummyM, dummyS, dummyI; - loudness.get_values(dummyM, dummyS, dummyI, RL, RH); - } - - println(argv[1]); - println("M = ", M); - println("S = ", S); - println("I = ", I); - println("LRA = ", RH - RL); - println("maxM = ", maxM); - println("maxS = ", maxS); - println(); - - return 0; -} diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp @@ -1,5 +0,0 @@ -#include <kfr/all.hpp> - -using namespace kfr; - -int main() {} diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp @@ -13,6 +13,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(pack) { const univector<float, 21> v1 = 1 + counter(); @@ -59,6 +62,17 @@ TEST(test_arg_access) 
CHECK_EXPRESSION(e1, 10, [](size_t i) { return (i == 0 ? 100 : i) + 1; }); } +TEST(to_pointer) +{ + auto e1 = to_pointer(counter<float>()); + + CHECK_EXPRESSION(e1, infinite_size, [](size_t i) { return static_cast<float>(i); }); + + auto e2 = to_pointer(gen_linear(0.f, 1.f)); + + CHECK_EXPRESSION(e2, infinite_size, [](size_t i) { return static_cast<float>(i); }); +} + TEST(test_arg_replace) { univector<float, 10> v1 = counter(); @@ -88,11 +102,11 @@ TEST(placeholders_pointer) TEST(univector_assignment) { univector<int> x = truncate(counter(), 10); - CHECK(x.size() == 10); + CHECK(x.size() == 10u); univector<int> y; y = truncate(counter(), 10); - CHECK(y.size() == 10); + CHECK(y.size() == 10u); } TEST(size_calc) @@ -102,9 +116,9 @@ TEST(size_calc) auto b = slice(counter(), 100); CHECK(b.size() == infinite_size); auto c = slice(counter(), 100, 1000); - CHECK(c.size() == 1000); + CHECK(c.size() == 1000u); auto d = slice(c, 100); - CHECK(d.size() == 900); + CHECK(d.size() == 900u); } TEST(reverse) @@ -126,8 +140,8 @@ TEST(partition) { univector<double, 385> output = zeros(); auto result = partition(output, counter(), 5, 1); - CHECK(result.count == 5); - CHECK(result.chunk_size == 80); + CHECK(result.count == 5u); + CHECK(result.chunk_size == 80u); result(0); CHECK(sum(output) >= fast_range_sum(80 - 1)); @@ -144,8 +158,8 @@ TEST(partition) { univector<double, 385> output = zeros(); auto result = partition(output, counter(), 5, 160); - CHECK(result.count == 3); - CHECK(result.chunk_size == 160); + CHECK(result.count == 3u); + CHECK(result.chunk_size == 160u); result(0); CHECK(sum(output) >= fast_range_sum(160 - 1)); @@ -155,6 +169,7 @@ TEST(partition) CHECK(sum(output) == fast_range_sum(385 - 1)); } } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/generate_data.cpp b/tests/generate_data.cpp @@ -0,0 +1,114 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#define _USE_MATH_DEFINES + 
+#include "mpfr/mpfrplus.hpp" +#include <kfr/cometa.hpp> +#include <kfr/cometa/ctti.hpp> +#include <kfr/cometa/function.hpp> +#include <kfr/io/file.hpp> +#include <random> + +constexpr size_t points = 10000; +constexpr size_t points_2arg = 100; + +constexpr size_t fuzz_points = 10000; +constexpr size_t fuzz_points_2arg = 100; + +using namespace kfr; + +using testo::test_data_entry; + +template <typename T> +struct range_sampler +{ + double min; + double max; + T operator()(size_t i, size_t num) { return static_cast<T>(min + (max - min) * i / (points - 1)); } +}; + +template <typename T> +struct fuzz_sampler +{ + std::mt19937_64 rnd{ 12345 }; + T operator()(size_t i, size_t num) { return bitcast_anything<T>(static_cast<utype<T>>(rnd())); } +}; + +template <typename T, typename Sampler> +void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 1>>>& writer, + cometa::function<mpfr::number(const mpfr::number&)> func, Sampler&& sampler) +{ + for (size_t i = 0; i < points; i++) + { + test_data_entry<T, 1> entry; + entry.arguments[0] = sampler(i, points); + entry.result = static_cast<T>(func(entry.arguments[0])); + writer->write(entry); + } +} + +template <typename T, typename Sampler> +void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 2>>>& writer, + cometa::function<mpfr::number(const mpfr::number&, const mpfr::number&)> func, + Sampler&& sampler) +{ + for (size_t i = 0; i < points_2arg; i++) + { + for (size_t j = 0; j < points_2arg; j++) + { + test_data_entry<T, 2> entry; + entry.arguments[0] = sampler(i, points_2arg); + entry.arguments[1] = sampler(j, points_2arg); + entry.result = static_cast<T>(func(entry.arguments[0], entry.arguments[1])); + writer->write(entry); + } + } +} + +template <int args, typename Func> +void generate_test(cint_t<args>, const char* name, const Func& func, double min, double max) +{ + generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_narrow")), + func, 
range_sampler<float>{ min, max }); + generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_narrow")), + func, range_sampler<double>{ min, max }); + + generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_fuzz")), func, + fuzz_sampler<float>{}); + generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_fuzz")), + func, fuzz_sampler<double>{}); +} + +int main() +{ + using num = mpfr::number; + mpfr::scoped_precision prec(512); + generate_test(cint<1>, "sin", [](const num& x) { return mpfr::sin(x); }, 0, M_PI * 2); + generate_test(cint<1>, "cos", [](const num& x) { return mpfr::cos(x); }, 0, M_PI * 2); + generate_test(cint<1>, "tan", [](const num& x) { return mpfr::tan(x); }, 0, M_PI); + + generate_test(cint<1>, "asin", [](const num& x) { return mpfr::asin(x); }, 0, 1); + generate_test(cint<1>, "acos", [](const num& x) { return mpfr::acos(x); }, 0, 1); + generate_test(cint<1>, "atan", [](const num& x) { return mpfr::atan(x); }, 0, 1); + generate_test(cint<2>, "atan2", [](const num& x, const num& y) { return mpfr::atan2(x, y); }, 0, 10); + + generate_test(cint<1>, "sinh", [](const num& x) { return mpfr::sinh(x); }, 0, 10 * 2); + generate_test(cint<1>, "cosh", [](const num& x) { return mpfr::cosh(x); }, 0, 10 * 2); + generate_test(cint<1>, "tanh", [](const num& x) { return mpfr::tanh(x); }, 0, 10 * 2); + generate_test(cint<1>, "coth", [](const num& x) { return mpfr::coth(x); }, 0, 10 * 2); + + generate_test(cint<1>, "gamma", [](const num& x) { return mpfr::gamma(x); }, 0, 10); + + generate_test(cint<1>, "log", [](const num& x) { return mpfr::log(x); }, 0, 100); + generate_test(cint<1>, "log2", [](const num& x) { return mpfr::log2(x); }, 0, 100); + generate_test(cint<1>, "log10", [](const num& x) { return mpfr::log10(x); }, 0, 100); + + generate_test(cint<1>, "exp", [](const num& x) { return mpfr::exp(x); }, -10, 10); + generate_test(cint<1>, "exp2", [](const 
num& x) { return mpfr::exp2(x); }, -10, 10); + generate_test(cint<1>, "exp10", [](const num& x) { return mpfr::exp10(x); }, -10, 10); + + generate_test(cint<1>, "cbrt", [](const num& x) { return mpfr::cbrt(x); }, 0, 1000); +} diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp @@ -7,44 +7,12 @@ #include <kfr/testo/testo.hpp> #include <kfr/base.hpp> -#include <kfr/dsp.hpp> #include <kfr/io.hpp> using namespace kfr; -constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, // - i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, // - i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 // -#ifdef KFR_NATIVE_I64 - , - i64x1, i64x2, i64x4, i64x8, i64x16, i64x3 // -#endif - > - signed_types{}; - -constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, // - u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, // - u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 // -#ifdef KFR_NATIVE_I64 - , - u64x1, u64x2, u64x4, u64x8, u64x16, u64x3 // -#endif - > - unsigned_types{}; - -constexpr ctypes_t<f32x1, f32x2, f32x4, f32x8, f32x16, f32x3 // -#ifdef KFR_NATIVE_F64 - , - f64x1, f64x2, f64x4, f64x8, f64x16, f64x3 // -#endif - > - float_types{}; - -template <typename T> -inline T ref_abs(T x) +namespace CMT_ARCH_NAME { - return x >= T(0) ? 
x : -x; -} template <typename T> bool builtin_add_overflow(T x, T y, T* r) @@ -127,43 +95,6 @@ inline T ref_satsub(T x, T y) return result; } -TEST(intrin_select) -{ - testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) { - using Tvec = type_of<decltype(type)>; - using T = subtype<Tvec>; - CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2)); - CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1)); - }); -} - -TEST(intrin_abs) -{ - testo::assert_is_same<decltype(kfr::abs(1)), int>(); - testo::assert_is_same<decltype(kfr::abs(1u)), unsigned int>(); - testo::assert_is_same<decltype(kfr::abs(make_vector(1))), i32x1>(); - testo::assert_is_same<decltype(kfr::abs(make_vector(1, 2))), i32x2>(); - CHECK(kfr::abs(9u) == 9u); - CHECK(kfr::abs(9) == 9); - CHECK(kfr::abs(-9) == 9); - CHECK(kfr::abs(-infinity) == infinity); - CHECK(kfr::abs(make_vector(9)) == make_vector(9)); - CHECK(kfr::abs(make_vector(-9)) == make_vector(9)); - - testo::matrix(named("type") = signed_types, named("value") = std::vector<int>{ -1, 0, +1 }, - [](auto type, int value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x)); - }); - testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ -1, 0, +1 }, - [](auto type, int value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x)); - }); -} - TEST(intrin_sqrt) { testo::assert_is_same<decltype(kfr::sqrt(9)), fbase>(); @@ -175,141 +106,45 @@ TEST(intrin_sqrt) CHECK(kfr::sqrt(-9) == fbase(qnan)); CHECK(kfr::sqrt(make_vector(9)) == make_vector<fbase>(3.0)); CHECK(kfr::sqrt(make_vector(-9)) == make_vector<fbase>(qnan)); - testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ 0, 2, 65536 }, + testo::matrix(named("type") = 
float_vector_types<vec>, named("value") = std::vector<int>{ 0, 2, 65536 }, [](auto type, int value) { using T = type_of<decltype(type)>; const T x(value); - CHECK(kfr::sqrt(x) == apply([](auto x) { return std::sqrt(x); }, x)); - }); -} - -TEST(intrin_round) -{ - testo::assert_is_same<decltype(kfr::floor(100)), int>(); - testo::assert_is_same<decltype(kfr::ceil(100)), int>(); - testo::assert_is_same<decltype(kfr::round(100)), int>(); - testo::assert_is_same<decltype(kfr::trunc(100)), int>(); - testo::assert_is_same<decltype(kfr::fract(100)), int>(); - - testo::assert_is_same<decltype(kfr::ifloor(100.f)), int>(); - testo::assert_is_same<decltype(kfr::iceil(100.f)), int>(); - testo::assert_is_same<decltype(kfr::iround(100.f)), int>(); - testo::assert_is_same<decltype(kfr::itrunc(100.f)), int>(); - CHECK(kfr::floor(100) == 100); - CHECK(kfr::ceil(100) == 100); - CHECK(kfr::round(100) == 100); - CHECK(kfr::trunc(100) == 100); - CHECK(kfr::fract(100) == 0); - - testo::matrix(named("type") = float_types, - named("value") = std::vector<fbase>{ -1.51, -1.49, 0.0, +1.49, +1.51 }, - [](auto type, fbase value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::floor(x) == apply([](auto x) { return std::floor(x); }, x)); - CHECK(kfr::ceil(x) == apply([](auto x) { return std::ceil(x); }, x)); - CHECK(kfr::round(x) == apply([](auto x) { return std::round(x); }, x)); - CHECK(kfr::trunc(x) == apply([](auto x) { return std::trunc(x); }, x)); - CHECK(kfr::fract(x) == apply([](auto x) { return x - std::floor(x); }, x)); - }); -} - -TEST(intrin_min_max) -{ - testo::assert_is_same<decltype(min(1, 2)), int>(); - testo::assert_is_same<decltype(min(1, 2u)), unsigned int>(); - testo::assert_is_same<decltype(min(1, 2)), int>(); - testo::assert_is_same<decltype(min(pack(1), 2u)), u32x1>(); - testo::assert_is_same<decltype(min(2u, pack(1))), u32x1>(); - testo::assert_is_same<decltype(min(pack(1), pack(2u))), u32x1>(); - testo::assert_is_same<decltype(min(pack(1, 2, 3), 
pack(1.0, 2.0, 3.0))), f64x3>(); - testo::assert_is_same<decltype(min(pack(1.0, 2.0, 3.0), pack(1, 2, 3))), f64x3>(); - - CHECK(min(1, 2) == 1); - CHECK(min(1, 2u) == 1u); - CHECK(min(pack(1), 2) == pack(1)); - CHECK(min(pack(1, 2, 3), 2) == pack(1, 2, 2)); - CHECK(min(pack(1., 2., 3.), 2) == pack(1., 2., 2.)); - - testo::matrix(named("type") = float_types, - named("value") = std::vector<std::pair<fbase, fbase>>{ { -100, +100 }, { infinity, 0.0 } }, - [](auto type, std::pair<fbase, fbase> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); - }); - testo::matrix(named("type") = signed_types, - named("value") = std::vector<std::pair<int, int>>{ { -100, +100 } }, - [](auto type, std::pair<int, int> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); - }); - testo::matrix(named("type") = unsigned_types, - named("value") = std::vector<std::pair<unsigned, unsigned>>{ { 0, +200 } }, - [](auto type, std::pair<unsigned, unsigned> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, 
y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); + CHECK(kfr::sqrt(x) == apply([](auto x) -> decltype(x) { return std::sqrt(x); }, x)); }); } TEST(intrin_satadd_satsub) { - testo::matrix(named("type") = signed_types, [](auto type) { - using T = type_of<decltype(type)>; - using Tsub = subtype<T>; - const T min = std::numeric_limits<Tsub>::min(); - const T max = std::numeric_limits<Tsub>::max(); - CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min)); - CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max)); - CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max)); - CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min)); - - CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min)); - CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max)); - CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max)); - CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min)); - }); - - testo::matrix(named("type") = unsigned_types, [](auto type) { - using T = type_of<decltype(type)>; - using Tsub = subtype<T>; - const T& min = std::numeric_limits<Tsub>::min(); - const T& max = std::numeric_limits<Tsub>::max(); - CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min)); - CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max)); - CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max)); 
- CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min)); - - CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min)); - CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max)); - CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max)); - CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min)); - }); + testo::matrix(named("type") = cconcat(signed_vector_types<vec>, unsigned_vector_types<vec>), + [](auto type) { + using T = type_of<decltype(type)>; + using Tsub = subtype<T>; + const T min = std::numeric_limits<Tsub>::min(); + const T max = std::numeric_limits<Tsub>::max(); + CHECK(kfr::satadd(min, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, min)); + CHECK(kfr::satadd(max, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, max)); + CHECK(kfr::satadd(min, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, max)); + CHECK(kfr::satadd(max, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, min)); + + CHECK(kfr::satsub(min, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, min)); + CHECK(kfr::satsub(max, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, max)); + CHECK(kfr::satsub(min, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, max)); + CHECK(kfr::satsub(max, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, min)); + }); } TEST(intrin_any_all) { - testo::matrix(named("type") = unsigned_types, [](auto type) { + testo::matrix(named("type") = unsigned_vector_types<vec>, [](auto type) { using T = type_of<decltype(type)>; constexpr size_t width = widthof<T>(); using Tsub = subtype<T>; @@ 
-328,74 +163,7 @@ TEST(intrin_any_all) }); } -TEST(intrin_math) -{ - testo::assert_is_same<decltype(pack(11) * pack(0.5)), f64x1>(); - testo::assert_is_same<decltype(pack(11) * 0.5), f64x1>(); - testo::assert_is_same<decltype(kfr::sin(2)), fbase>(); - testo::assert_is_same<decltype(kfr::sin(pack(2))), vec<fbase, 1>>(); - testo::assert_is_same<decltype(kfr::sindeg(2)), fbase>(); - testo::assert_is_same<decltype(kfr::sindeg(pack(2))), vec<fbase, 1>>(); - - CHECK(pack(11) * pack(0.5) == 5.5); - CHECK(pack(11) * 0.5 == 5.5); - CHECK(kfr::sin(2) == fbase(0.90929742682568169539601986591174)); - CHECK(kfr::sin(pack(2)) == pack(fbase(0.90929742682568169539601986591174))); - CHECK(kfr::sindeg(2) == fbase(0.03489949670250097164599518162533)); - CHECK(kfr::sindeg(pack(2)) == pack(fbase(0.03489949670250097164599518162533))); - CHECK(kfr::cos(2) == fbase(-0.41614683654714238699756822950076)); - CHECK(kfr::cos(pack(2)) == pack(fbase(-0.41614683654714238699756822950076))); - CHECK(kfr::cosdeg(2) == fbase(0.99939082701909573000624344004393)); - CHECK(kfr::cosdeg(pack(2)) == pack(fbase(0.99939082701909573000624344004393))); - - CHECK(kfr::log(2) == fbase(0.6931471805599453)); - CHECK(kfr::log(pack(2)) == pack(fbase(0.6931471805599453))); - CHECK(kfr::log2(2) == fbase(1.0)); - CHECK(kfr::log2(pack(2)) == pack(fbase(1.0))); - CHECK(kfr::log10(2) == fbase(0.30102999566398119521373889472449)); - CHECK(kfr::log10(pack(2)) == pack(fbase(0.30102999566398119521373889472449))); - - CHECK(kfr::exp(2) == fbase(7.3890560989306502)); - CHECK(kfr::exp(pack(2)) == pack(fbase(7.3890560989306502))); - CHECK(kfr::exp2(2) == fbase(4.0)); - CHECK(kfr::exp2(pack(2)) == pack(fbase(4.0))); - - CHECK(kfr::logn(2, 10) == fbase(0.30102999566398119521373889472449)); - CHECK(kfr::logn(pack(2), pack(10)) == pack(fbase(0.30102999566398119521373889472449))); - - CHECK(kfr::pow(2, fbase(0.9)) == fbase(1.8660659830736148319626865322999)); - CHECK(kfr::pow(pack(2), pack(fbase(0.9))) == 
pack(fbase(1.8660659830736148319626865322999))); - - CHECK(kfr::root(fbase(1.5), 2) == fbase(1.2247448713915890490986420373529)); - CHECK(kfr::root(pack(fbase(1.5)), pack(2)) == pack(fbase(1.2247448713915890490986420373529))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::sinh(2) == fbase(3.6268604078470187676682139828013)); - CHECK(kfr::sinh(pack(2)) == pack(fbase(3.6268604078470187676682139828013))); - CHECK(kfr::cosh(2) == fbase(3.7621956910836314595622134777737)); - CHECK(kfr::cosh(pack(2)) == pack(fbase(3.7621956910836314595622134777737))); - - CHECK(kfr::tanh(2) == fbase(0.96402758007581688394641372410092)); - CHECK(kfr::tanh(pack(2)) == pack(fbase(0.96402758007581688394641372410092))); - CHECK(kfr::coth(2) == fbase(1.0373147207275480958778097647678)); - CHECK(kfr::coth(pack(2)) == pack(fbase(1.0373147207275480958778097647678))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::tan(2) == fbase(-2.1850398632615189916433061023137)); - CHECK(kfr::tan(pack(2)) == pack(fbase(-2.1850398632615189916433061023137))); - CHECK(kfr::tandeg(2) == fbase(0.03492076949174773050040262577373)); - CHECK(kfr::tandeg(pack(2)) == pack(fbase(0.03492076949174773050040262577373))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233)); - CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233))); -} +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/io_test.cpp b/tests/io_test.cpp @@ -8,11 +8,13 @@ #include <kfr/base.hpp> #include <kfr/cometa/function.hpp> -#include <kfr/dsp.hpp> #include <kfr/io.hpp> using namespace kfr; +namespace CMT_ARCH_NAME +{ + #if KFR_ENABLE_WAV TEST(write_wav_file) { @@ -22,17 +24,17 @@ TEST(write_wav_file) data = sin(counter() * 0.01f); size_t wr = writer.write(data.data(), data.size()); CHECK(wr == data.size()); - 
CHECK(writer.format().length == data.size() / 2); + CHECK(umax(writer.format().length) == data.size() / 2); } TEST(read_wav_file) { audio_reader_wav<float> reader(open_file_for_reading(KFR_FILEPATH("temp_audio_file.wav"))); - CHECK(reader.format().channels == 2); + CHECK(reader.format().channels == 2u); CHECK(reader.format().type == audio_sample_type::i16); CHECK(reader.format().samplerate == 44100); univector<float> data(44100 * 2); - CHECK(reader.format().length == data.size() / 2); + CHECK(umax(reader.format().length) == data.size() / 2); size_t rd = reader.read(data.data(), data.size()); CHECK(rd == data.size()); CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f); @@ -40,10 +42,10 @@ TEST(read_wav_file) #endif #if KFR_ENABLE_FLAC -TEST(read_flac_file) +DTEST(read_flac_file) { audio_reader_flac<float> reader(open_file_for_reading(KFR_FILEPATH("../../tests/test-audio/sine.flac"))); - CHECK(reader.format().channels == 2); + CHECK(reader.format().channels == 2u); CHECK(reader.format().type == audio_sample_type::i32); CHECK(reader.format().samplerate == 44100); univector<float> data(44100 * 2); @@ -53,6 +55,7 @@ TEST(read_flac_file) CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f); } #endif +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/mpfr/mpfrplus.hpp b/tests/mpfr/mpfrplus.hpp @@ -18,6 +18,7 @@ MPFR_DIAG_PRAGMA(ignored "-Wsign-conversion") MPFR_DIAG_PRAGMA(pop) #include <cmath> #include <limits> +#include <string> #include <type_traits> namespace mpfr @@ -47,17 +48,14 @@ constexpr with_precision_t with_precision{}; namespace internal { -#ifndef MPFR_THREAD_LOCAL -#define MPFR_THREAD_LOCAL thread_local -#endif -static mpfr_prec_t& precision() +inline mpfr_prec_t& precision() { - static MPFR_THREAD_LOCAL mpfr_prec_t prec = mpfr_get_default_prec(); + static mpfr_prec_t prec = mpfr_get_default_prec(); return prec; } -static mpfr_rnd_t& rounding_mode() +inline mpfr_rnd_t& 
rounding_mode() { - static MPFR_THREAD_LOCAL mpfr_rnd_t rnd = mpfr_get_default_rounding_mode(); + static mpfr_rnd_t rnd = mpfr_get_default_rounding_mode(); return rnd; } } // namespace internal @@ -241,7 +239,7 @@ public: MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned int) MPFR_CXX_CTOR_T(mpfr_set_si, long int) MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned long int) -#if __INTMAX_MAX__ != __LONG_MAX__ +#ifdef _MPFR_H_HAVE_INTMAX_T MPFR_CXX_CTOR_T(mpfr_set_sj, intmax_t) MPFR_CXX_CTOR_T(mpfr_set_uj, uintmax_t) #endif @@ -253,7 +251,7 @@ public: MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned int) MPFR_CXX_ASGN_T(mpfr_set_si, long int) MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned long int) -#if __INTMAX_MAX__ != __LONG_MAX__ +#ifdef _MPFR_H_HAVE_INTMAX_T MPFR_CXX_ASGN_T(mpfr_set_sj, intmax_t) MPFR_CXX_ASGN_T(mpfr_set_uj, uintmax_t) #endif @@ -300,6 +298,15 @@ public: { return mpfr_get_ld(val, internal::rounding_mode()); } + + std::string to_string() const + { + char* str; + mpfr_asprintf(&str, "%.*Rg", prec(), val); + std::string result = str; + mpfr_free_str(str); + return result; + } }; #ifdef MPFR_USE_UDL diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp @@ -7,7 +7,6 @@ #include <kfr/testo/testo.hpp> #include <kfr/base.hpp> -#include <kfr/cpuid.hpp> #include <kfr/dsp.hpp> #include <kfr/io.hpp> diff --git a/tests/numeric_tests.hpp b/tests/numeric_tests.hpp @@ -0,0 +1,123 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/io.hpp> +#include <kfr/testo/testo.hpp> + +namespace kfr +{ + +using testo::test_data_entry; + +inline namespace CMT_ARCH_NAME +{ + +using vector_types = + ctypes_t<f32, f32x1, f32x2, f32x4, f32x8, f32x16, f32x32, f64, f64x1, f64x2, f64x4, f64x8, f64x16>; + +template <typename T> +uint64_t ulps(T x, T y) +{ + if (std::abs(x) < std::numeric_limits<T>::min() && std::abs(y) < std::numeric_limits<T>::min()) + return 0; + if (std::isnan(x) && std::isnan(y)) + return 0; + if (std::isinf(x) && std::isinf(y)) + 
return (x < 0) == (y < 0) ? 0 : ULLONG_MAX; + if (x < 0 && y < 0) + return ulps<T>(-x, -y); + if ((x < 0) != (y < 0)) + return ulps<T>(std::abs(x), 0) + ulps<T>(std::abs(y), 0); + + utype<T> ix = cometa::bitcast_anything<utype<T>>(x); + utype<T> iy = cometa::bitcast_anything<utype<T>>(y); + if (std::abs(x) < std::numeric_limits<T>::min() && y > std::numeric_limits<T>::min()) + return 1 + ulps<T>(std::numeric_limits<T>::min(), y); + if (std::abs(x) > std::numeric_limits<T>::min() && y < std::numeric_limits<T>::min()) + return 1 + ulps<T>(x, std::numeric_limits<T>::min()); + return ix > iy ? ix - iy : iy - ix; +} + +template <typename T, size_t N> +uint64_t ulps(vec<T, N> x, vec<T, N> y) +{ + uint64_t u = 0; + for (size_t i = 0; i < N; i++) + { + u = std::max(u, ulps(x[i], y[i])); + } + return u; +} + +inline const char* tname(ctype_t<f32>) { return "float"; } +inline const char* tname(ctype_t<f64>) { return "double"; } + +#define CHECK_DIFF(x_arg, y_arg, threshold) \ + do \ + { \ + ++checks_count; \ + const auto x_arg_value = x_arg; \ + const auto y_arg_value = y_arg; \ + const auto arg_diff = ulps(x_arg_value, y_arg_value); \ + error_sum += arg_diff; \ + error_peak = std::max(error_peak, arg_diff); \ + ::testo::active_test()->check( \ + arg_diff <= threshold, \ + ::cometa::as_string(x_arg_value, " ~= ", y_arg_value, " (", arg_diff, " <= ", threshold, ")"), \ + #x_arg " ~= " #y_arg); \ + } while (0) + +#define KFR_AUTO_TEST_1(fn, datafile, maxulps, avgulps) \ + TEST(fn##_##datafile) \ + { \ + testo::matrix(named("type") = vector_types(), [&](auto type) { \ + using T = type_of<decltype(type)>; \ + using Tsub = subtype<T>; \ + double error_sum = 0.0; \ + uint64_t error_peak = 0; \ + uint64_t checks_count = 0; \ + std::shared_ptr<file_reader<test_data_entry<Tsub, 1>>> reader = \ + open_file_for_reading<test_data_entry<Tsub, 1>>( \ + std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \ + test_data_entry<Tsub, 1> entry; \ + while 
(reader->read(entry)) \ + { \ + testo::scope s(as_string(entry.arguments[0])); \ + CHECK_DIFF(kfr::fn(entry.arguments[0]), entry.result, maxulps); \ + } \ + CHECK(checks_count > 0u); \ + CHECK(error_sum / checks_count <= avgulps); \ + println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \ + error_peak, ")"); \ + }); \ + } + +#define KFR_AUTO_TEST_2(fn, datafile, maxulps, avgulps) \ + TEST(fn##_##datafile) \ + { \ + testo::matrix(named("type") = vector_types(), [&](auto type) { \ + using T = type_of<decltype(type)>; \ + using Tsub = subtype<T>; \ + double error_sum = 0.0; \ + uint64_t error_peak = 0; \ + uint64_t checks_count = 0; \ + std::shared_ptr<file_reader<test_data_entry<Tsub, 2>>> reader = \ + open_file_for_reading<test_data_entry<Tsub, 2>>( \ + std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \ + test_data_entry<Tsub, 2> entry; \ + while (reader->read(entry)) \ + { \ + testo::scope s(as_string(entry.arguments[0], entry.arguments[1])); \ + CHECK_DIFF(kfr::fn(entry.arguments[0], entry.arguments[1]), entry.result, maxulps); \ + } \ + CHECK(checks_count > 0u); \ + CHECK(error_sum / checks_count <= avgulps); \ + println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \ + error_peak, ")"); \ + }); \ + } +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/resampler_test.cpp b/tests/resampler_test.cpp @@ -1,37 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/dsp.hpp> -#include <kfr/io.hpp> -#include <kfr/testo/testo.hpp> - -using namespace kfr; - -TEST(resampler_test) -{ - const int in_sr = 44100; - const int out_sr = 48000; - const int freq = 100; - auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr); - double delay = resampler.get_fractional_delay(); - univector<fbase> out(out_sr / 10); - univector<fbase> in = 
truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10); - univector<fbase> ref = truncate( - sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))), - out_sr / 10); - resampler.process(out, in); - - CHECK(rms(slice(out - ref, ceil(delay * 2))) < 0.005f); -} - -#ifndef KFR_NO_MAIN -int main() -{ - println(library_version()); - - return testo::run_all("", true); -} -#endif diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp @@ -1,172 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/testo/testo.hpp> - -#include <kfr/base.hpp> -#include <kfr/io.hpp> - -#define MPFR_THREAD_LOCAL -#include "mpfr/mpfrplus.hpp" - -using namespace kfr; - -using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>; - -template <typename T> -double ulps(T test, const mpfr::number& ref) -{ - if (std::isnan(test) && ref.isnan()) - return 0; - if (std::isinf(test) && ref.isinfinity()) - return (test < 0) == (ref < 0) ? 
0 : NAN; - return static_cast<double>(mpfr::abs(mpfr::number(test) - ref) / - mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL))); -} - -template <typename T, size_t N> -double ulps(const vec<T, N>& test, const mpfr::number& ref) -{ - double u = 0; - for (size_t i = 0; i < N; ++i) - u = std::max(u, ulps(test[i], ref)); - return u; -} - -TEST(test_sin_cos) -{ - testo::matrix(named("type") = vector_types(), - named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); - CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); - }); - testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); - CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); - }); -} - -TEST(test_tan) -{ - testo::matrix(named("type") = ctypes_t<f32>(), - named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0); - }); - testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0); - }); -} - -#ifdef __clang__ -#define ARCFN_ULP 2.0 -#else -#define ARCFN_ULP 2.5 -#endif - -TEST(test_asin_acos_atan) -{ - testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < ARCFN_ULP); - CHECK(ulps(kfr::acos(x), 
mpfr::acos(subtype<T>(value))) < ARCFN_ULP); - CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < ARCFN_ULP); - }); -} - -TEST(test_atan2) -{ - testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1), - named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) { - using T = type_of<decltype(type)>; - const T x(value1); - const T y(value2); - CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) < - ARCFN_ULP); - }); -} - -TEST(test_log) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log(x), mpfr::log(x)) < 2.0); - }); -} - -TEST(test_log2) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log2(x), mpfr::log2(x)) < 3.0); - }); -} - -TEST(test_log10) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log10(x), mpfr::log10(x)) < 3.0); - }); -} - -TEST(test_exp) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp(x), mpfr::exp(x)) < 2.0); - }); -} - -TEST(test_exp2) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp2(x), mpfr::exp2(x)) < 3.0); - }); -} - -TEST(test_exp10) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - 
[](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp10(x), mpfr::exp10(x)) < 3.0); - }); -} - -#ifndef KFR_NO_MAIN -int main() -{ - println(library_version(), " running on ", cpu_runtime()); - mpfr::scoped_precision p(128); - return testo::run_all(""); -} -#endif diff --git a/tests/unit/base/conversion.cpp b/tests/unit/base/conversion.cpp @@ -0,0 +1,67 @@ +#include <kfr/base/conversion.hpp> + +#include <kfr/base/basic_expressions.hpp> + +#include <kfr/base/reduce.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +TEST(sample_conversion) +{ + CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f); + CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f); + CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f); + CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f); + CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f); + + CHECK(convert_sample<i8>(-1.f) == -127); + CHECK(convert_sample<i8>(0.f) == 0); + CHECK(convert_sample<i8>(1.f) == 127); + + CHECK(convert_sample<i16>(-1.f) == -32767); + CHECK(convert_sample<i16>(0.f) == 0); + CHECK(convert_sample<i16>(1.f) == 32767); + + CHECK(convert_sample<i24>(-1.f) == -8388607); + CHECK(convert_sample<i24>(0.f) == 0); + CHECK(convert_sample<i24>(1.f) == 8388607); + + CHECK(convert_sample<i32>(-1.f) == -2147483647); + CHECK(convert_sample<i32>(0.f) == 0); + CHECK(convert_sample<i32>(1.f) == 2147483647); +} + +TEST(sample_interleave_deinterleave) +{ + const size_t size = 50; + 
univector2d<float> in; + in.push_back(truncate(counter() * 3.f + 0.f, size)); + in.push_back(truncate(counter() * 3.f + 1.f, size)); + in.push_back(truncate(counter() * 3.f + 2.f, size)); + univector<float> out(size * 3); + interleave(out.data(), std::array<const float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), 3, + size); + CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0); + + deinterleave(std::array<float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), out.data(), 3, + size); + + CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0); + CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0); + CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/base/reduce.cpp b/tests/unit/base/reduce.cpp @@ -0,0 +1,41 @@ +#include <kfr/base/reduce.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +TEST(reduce) +{ + { + univector<float, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(rms(a) == 3.316624790355399849115f); + CHECK(product(a) == 120); + } + { + univector<double, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(rms(a) == 3.316624790355399849115); + CHECK(product(a) == 120); + } + { + univector<int, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(product(a) == 120); + } +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/abs.cpp b/tests/unit/math/abs.cpp @@ -0,0 +1,13 @@ +#include <kfr/math/abs.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(abs) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::abs(x); }, + [](auto x) -> decltype(x) { 
return x >= 0 ? x : -x; }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/asin_acos.cpp b/tests/unit/math/asin_acos.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/asin_acos.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(asin, narrow, 6, 1) +KFR_AUTO_TEST_1(acos, narrow, 800, 1) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/tests/unit/math/atan.cpp b/tests/unit/math/atan.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include "../../numeric_tests.hpp" + +#include <kfr/math/atan.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(atan, narrow, 2, 1) +KFR_AUTO_TEST_2(atan2, narrow, 2, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/hyperbolic.cpp b/tests/unit/math/hyperbolic.cpp @@ -0,0 +1,21 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include "../../numeric_tests.hpp" + +#include <kfr/math/hyperbolic.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(sinh, narrow, 114, 2.5) +KFR_AUTO_TEST_1(cosh, narrow, 7, 2.5) +KFR_AUTO_TEST_1(tanh, narrow, 45, 1) +KFR_AUTO_TEST_1(coth, narrow, 85, 1) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/tests/unit/math/log_exp.cpp b/tests/unit/math/log_exp.cpp @@ -0,0 +1,23 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/log_exp.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(gamma, narrow, 2200, 321) +KFR_AUTO_TEST_1(exp, narrow, 4, 2) +KFR_AUTO_TEST_1(exp2, narrow, 5, 2) +KFR_AUTO_TEST_1(exp10, narrow, 40, 10) +KFR_AUTO_TEST_1(log, narrow, 
2, 1) +KFR_AUTO_TEST_1(log2, narrow, 2, 1) +KFR_AUTO_TEST_1(log10, narrow, 3, 1) +KFR_AUTO_TEST_1(cbrt, narrow, 5, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/min_max.cpp b/tests/unit/math/min_max.cpp @@ -0,0 +1,39 @@ +#include <kfr/math/min_max.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(min) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::min(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x <= y ? x : y; }); +} + +TEST(max) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::max(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x >= y ? x : y; }); +} + +TEST(absmin) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmin(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + x = x >= 0 ? x : -x; + y = y >= 0 ? y : -y; + return x <= y ? x : y; + }); +} + +TEST(absmax) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmax(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + x = x >= 0 ? x : -x; + y = y >= 0 ? y : -y; + return x >= y ? x : y; + }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/round.cpp b/tests/unit/math/round.cpp @@ -0,0 +1,53 @@ +#include <kfr/math/round.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(floor) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::floor(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::floor(x)); + }); +} + +TEST(ceil) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::ceil(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? 
x + : static_cast<decltype(x)>(std::ceil(x)); + }); +} + +TEST(trunc) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::trunc(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::trunc(x)); + }); +} + +TEST(round) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::round(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::round(x)); + }); +} + +TEST(fract) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::fract(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value + ? 0 + : static_cast<decltype(x)>(x - std::floor(x)); + }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/select.cpp b/tests/unit/math/select.cpp @@ -0,0 +1,27 @@ +#include <kfr/math/select.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(select_true) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(true); + return kfr::select(m, x, y); + }, + [](auto x, auto) { return x; }); +} + +TEST(select_false) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(false); + return kfr::select(m, x, y); + }, + [](auto, auto y) { return y; }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/sin_cos.cpp b/tests/unit/math/sin_cos.cpp @@ -0,0 +1,17 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/sin_cos.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(sin, narrow, 7, 1) +KFR_AUTO_TEST_1(cos, narrow, 2, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/tan.cpp b/tests/unit/math/tan.cpp @@ -0,0 +1,16 @@ 
+/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/tan.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(tan, narrow, 7, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/complex.cpp b/tests/unit/simd/complex.cpp @@ -0,0 +1,33 @@ +#include <kfr/simd/complex.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(complex_convertible) +{ + static_assert(std::is_convertible<float, complex<float>>::value, ""); + static_assert(std::is_convertible<float, complex<double>>::value, ""); + static_assert(std::is_convertible<short, complex<double>>::value, ""); + + static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, ""); + static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, ""); + + static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); + static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); + + CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f }); + CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. }); + CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. }); + + CHECK(static_cast<vec<complex<float>, 2>>(complex<float>{ 1.f, 2.f }) == + vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); + + CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) == + vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); + + CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) == + vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. 
} }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp @@ -0,0 +1,220 @@ +#include <kfr/simd/horizontal.hpp> +#include <kfr/simd/operators.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(neg) +{ + test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return -x; }, + [](auto x) -> decltype(x) { return -x; }); +} + +TEST(bnot) +{ + test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return ~x; }, + [](auto x) -> decltype(x) { + utype<decltype(x)> u = ~ubitcast(x); + return bitcast<decltype(x)>(u); + }); +} + +TEST(add) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x + y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x + y; }); +} + +TEST(sub) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x - y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x - y; }); +} + +TEST(mul) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x * y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x * y; }); +} + +template <typename T> +inline bool is_safe_division(T x, T y) +{ + return y != T(0) && !(std::is_signed<T>::value && x == std::numeric_limits<T>::min() && y == T(-1)); +} + +TEST(div) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + return is_safe_division<subtype<decltype(x)>>(x.front(), y.front()) ? x / y : 0; + }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return is_safe_division(x, y) ? 
x / y : 0; + }); +} + +TEST(bor) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x | y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) | ubitcast(T(y)))); + }); +} + +TEST(bxor) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x ^ y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) ^ ubitcast(T(y)))); + }); +} + +TEST(band) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x & y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) & ubitcast(T(y)))); + }); +} + +TEST(shl) +{ + testo::matrix( + named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(), + named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 }, + [&](auto type, special_value value, unsigned shift) { + using T = type_of<decltype(type)>; + if (shift < sizeof(subtype<T>)) + { + const T x(value); + CHECK(std::is_same<decltype(x << shift), T>::value); + CHECK((x << shift) == apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) << shift)); + }, + x)); + CHECK((x << broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) == + apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) << shift)); + }, + x)); + } + }); +} + +TEST(shr) +{ + testo::matrix( + named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(), + named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 }, + [&](auto type, special_value value, 
unsigned shift) { + using T = type_of<decltype(type)>; + if (shift < sizeof(subtype<T>)) + { + const T x(value); + CHECK(std::is_same<decltype(x << shift), T>::value); + CHECK((x >> shift) == apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift)); + }, + x)); + CHECK((x >> broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) == + apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift)); + }, + x)); + } + }); +} + +TEST(eq) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x == y); + }); +} + +TEST(ne) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x != y); + }); +} + +TEST(ge) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x >= y); + }); +} + +TEST(le) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x <= y); + }); +} + +TEST(gt) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x > y); + }); +} + +TEST(lt) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x < y); + }); +} + 
+TEST(horner) +{ + CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34)); + CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786)); + CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262)); +} + +TEST(matrix) +{ + using i32x2x2 = vec<vec<int, 2>, 2>; + const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } }); + + CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } }); + + i32x2 xy{ 10, 20 }; + i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + xy = hadd(xy * m); + CHECK(xy == i32x2{ 40, 120 }); + + i32x2 xy2{ 10, 20 }; + xy2 = hadd(transpose(xy2 * m)); + CHECK(xy2 == i32x2{ 50, 110 }); +} + +TEST(apply) +{ + CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6)); + CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/shuffle.cpp b/tests/unit/simd/shuffle.cpp @@ -0,0 +1,160 @@ +#include <kfr/simd/shuffle.hpp> +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(concat) +{ + CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) // + == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }); +} + +TEST(reverse) +{ + CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0)); + CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1)); + CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3)); +} + +TEST(shuffle) +{ + const vec<int, 8> numbers1 = enumerate<int, 8>(); + const vec<int, 8> numbers2 = enumerate<int, 8, 100>(); + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) == + vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); +} + +TEST(permute) +{ + const vec<int, 8> numbers1 = 
enumerate<int, 8>(); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); +} + +TEST(blend) +{ + const vec<int, 8> numbers1 = enumerate<int, 8>(); + const vec<int, 8> numbers2 = enumerate<int, 8, 100>(); + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) == + vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) == + vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); +} + +TEST(duplicate_shuffle) +{ + CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3)); + CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3)); + CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6)); + CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7)); +} + +TEST(split_interleave) +{ + vec<f32, 1> a1; + vec<f32, 2> a23; + vec<f32, 1> a4; + vec<f32, 3> a567; + split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567); + CHECK(a1 == vec<f32, 1>{ 1 }); + CHECK(a23 == vec<f32, 2>{ 2, 3 }); + CHECK(a4 == vec<f32, 1>{ 4 }); + CHECK(a567 == vec<f32, 3>{ 5, 6, 7 }); + + CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7)); + CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); + + CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7)); + CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); +} + +TEST(broadcast) +{ + CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1)); + CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2)); + CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4)); + CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8)); + + CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); + CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 
}); + CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); +} + +TEST(resize) +{ + CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); + CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); + CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); +} + +TEST(make_vector) +{ + const signed char ch = -1; + CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 }); + const i64 v = -100; + CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 }); + CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 }); + CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 }); + + CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) == + vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } }); + CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } }); +} + +TEST(zerovector) +{ + CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 }); + // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here + CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(allonesvector) +{ + CHECK(bitcast<u32>(special_constants<f32>::allones()) == 0xFFFFFFFFu); + CHECK(bitcast<u64>(special_constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull); + + CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 }); + CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 }); +} + +TEST(transpose) +{ + const auto sixteen = enumerate<float, 16>(); + CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); +} + +TEST(odd_even) +{ + CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6)); + CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7)); + + CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5)); + CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7)); +} + +TEST(low_high) +{ + CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 8>(1, 2, 
3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8)); + + CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7)); + + CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6)); + + CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5)); + + CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2)); + CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4)); + + CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2)); + CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3)); + + CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1)); + CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp @@ -0,0 +1,114 @@ +#include <kfr/simd/vec.hpp> + +#include <kfr/io/tostring.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(cones) +{ + CHECK(vec<int, 2>(cones) == vec<int, 2>(-1, -1)); + CHECK(vec<float, 2>(cones) == vec<f32, 2>(bitcast<f32>(-1), bitcast<f32>(-1))); +} +TEST(vec_broadcast) +{ + CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); + CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); + CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); +} +template <typename Tout, typename Tin> +bool is_in_range_of(Tin x) +{ + return (is_f_class<Tin>::value && is_f_class<Tout>::value) || static_cast<Tin>(static_cast<Tout>(x)) == x; +} + +TEST(cast) +{ + testo::assert_is_same<i32x4, kfr::common_type<i32x4>>(); + testo::assert_is_same<u32x4, kfr::common_type<i32x4, u32x4>>(); + testo::assert_is_same<f64x4, kfr::common_type<i32x4, u32x4, f64x4>>(); + + CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 }); + + CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{ + vec<float, 4>{ 1.f, 2.f, 3.f, 4.f 
}, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) == + vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } }); + + static_assert(std::is_convertible<float, f32x4>::value, ""); + static_assert(std::is_convertible<float, f64x8>::value, ""); + static_assert(std::is_convertible<float, u8x3>::value, ""); + + static_assert(std::is_convertible<u16x4, i32x4>::value, ""); + static_assert(!std::is_convertible<u16x4, i32x3>::value, ""); + static_assert(!std::is_convertible<u16x1, u16x16>::value, ""); + + static_assert(is_same<decltype(innercast<f64>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64>(f32x4(1))), f64x4>::value, ""); + static_assert(is_same<decltype(innercast<f64>(f32(1))), f64>::value, ""); + + // N/A static_assert(is_same<decltype(innercast<f64x4>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64x4>(f32x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64x4>(f32(1))), f64x4>::value, ""); + + // N/A static_assert(is_same<decltype(elemcast<f64>(f32x4x4(1))), f64x4>::value, ""); + static_assert(is_same<decltype(elemcast<f64>(f32x4(1))), f64x4>::value, ""); + + static_assert(is_same<decltype(elemcast<f64x4>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(elemcast<f64x4>(f32x4(1))), f64x4x4>::value, ""); + + testo::scope s(""); + s.text = ("target_type = u8"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u8>(x); }, + [](auto x) -> u8 { return static_cast<u8>(x); }, + [](auto t, special_value x) { return is_in_range_of<u8>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i8"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i8>(x); }, + [](auto x) -> i8 { return static_cast<i8>(x); }, + [](auto t, special_value x) { return is_in_range_of<i8>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u16"); + test_function1( 
+ test_catogories::all, [](auto x) { return kfr::innercast<u16>(x); }, + [](auto x) -> u16 { return static_cast<u16>(x); }, + [](auto t, special_value x) { return is_in_range_of<u16>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i16"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i16>(x); }, + [](auto x) -> i16 { return static_cast<i16>(x); }, + [](auto t, special_value x) { return is_in_range_of<i16>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u32>(x); }, + [](auto x) -> u32 { return static_cast<u32>(x); }, + [](auto t, special_value x) { return is_in_range_of<u32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i32>(x); }, + [](auto x) -> i32 { return static_cast<i32>(x); }, + [](auto t, special_value x) { return is_in_range_of<i32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u64"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u64>(x); }, + [](auto x) -> u64 { return static_cast<u64>(x); }, + [](auto t, special_value x) { return is_in_range_of<u64>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i64"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i64>(x); }, + [](auto x) -> i64 { return static_cast<i64>(x); }, + [](auto t, special_value x) { return is_in_range_of<i64>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = f32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<f32>(x); }, + [](auto x) -> f32 { return static_cast<f32>(x); }, + [](auto t, special_value x) { return is_in_range_of<f32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = f64"); + test_function1( + test_catogories::all, [](auto x) { 
return kfr::innercast<f64>(x); }, + [](auto x) -> f64 { return static_cast<f64>(x); }, + [](auto t, special_value x) { return is_in_range_of<f64>(x.get<subtype<type_of<decltype(t)>>>()); }); +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (C) 2016 D Levin (http://www.kfrlib.com) +# This file is part of KFR +# +# KFR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# KFR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with KFR. + + +cmake_minimum_required(VERSION 3.1) + +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) + +add_executable(sample_rate_converter sample_rate_converter.cpp) +target_link_libraries(sample_rate_converter kfr kfr_io use_arch) + +add_executable(ebu_test ebu_test.cpp) +target_link_libraries(ebu_test kfr kfr_io use_arch) diff --git a/tools/ebu_test.cpp b/tools/ebu_test.cpp @@ -0,0 +1,120 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/base.hpp> +#include <kfr/dsp.hpp> +#include <kfr/io.hpp> + +using namespace kfr; + +int main(int argc, char** argv) +{ + if (argc < 3) + { + println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER"); + return 1; + } + + // Prepare + FILE* f = fopen(argv[1], "rb"); + const int channel_number = atoi(argv[2]); + if (channel_number < 1 || channel_number > 6) + { + println("Incorrect 
number of channels"); + return 1; + } + fseek(f, 0, SEEK_END); + uintmax_t size = ftell(f); + fseek(f, 0, SEEK_SET); + if (size % (sizeof(float) * channel_number)) + { + println("Incorrect file size"); + return 1; + } + + // Read file + const size_t length = size / (sizeof(float) * channel_number); + univector<float> interleaved(size / sizeof(float)); + size_t read_len = fread(interleaved.data(), 1, size, f); + if (read_len != size) + { + println("Can't read file"); + return 1; + } + + // Deinterleave + univector<univector<float>> data(channel_number, univector<float>(length)); + for (size_t ch = 0; ch < channel_number; ++ch) + { + for (size_t i = 0; i < length; ++i) + { + data[ch][i] = interleaved[i * channel_number + ch]; + } + } + + std::vector<Speaker> speakers; + switch (channel_number) + { + case 1: + speakers = { Speaker::Mono }; + break; + case 2: + speakers = { Speaker::Left, Speaker::Right }; + break; + case 3: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center }; + break; + case 4: + speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround }; + break; + case 5: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround, + Speaker::RightSurround }; + break; + case 6: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center, + Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe }; + break; + } + + ebu_r128<float> loudness(48000, speakers); + + float M, S, I, RL, RH; + float maxM = -HUGE_VALF, maxS = -HUGE_VALF; + for (size_t i = 0; i < length / loudness.packet_size(); i++) + { + std::vector<univector_ref<float>> channels; + for (size_t ch = 0; ch < channel_number; ++ch) + { + channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size())); + } + loudness.process_packet(channels); + loudness.get_values(M, S, I, RL, RH); + maxM = std::max(maxM, M); + maxS = std::max(maxS, S); + } + + { + // For file-based measurements, the signal should be followed by at 
least 1.5 s of silence + std::vector<univector_dyn<float>> channels(channel_number, + univector_dyn<float>(loudness.packet_size())); + for (size_t i = 0; i < 15; ++i) + loudness.process_packet(channels); + float dummyM, dummyS, dummyI; + loudness.get_values(dummyM, dummyS, dummyI, RL, RH); + } + + println(argv[1]); + println("M = ", M); + println("S = ", S); + println("I = ", I); + println("LRA = ", RH - RL); + println("maxM = ", maxM); + println("maxS = ", maxS); + println(); + + return 0; +} diff --git a/examples/sample_rate_converter.cpp b/tools/sample_rate_converter.cpp diff --git a/update-sources.py b/update-sources.py @@ -7,25 +7,33 @@ import subprocess import sys import glob -path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'include') +def list_sources(name, searchpath, masks): + global cmake + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), searchpath) + filenames = [] + for root, dirnames, files in os.walk(path, path): + for mask in masks: + for filename in fnmatch.filter(files, mask): + filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/')) -masks = ['*.hpp', '*.h', '*.i', '*.inc'] + cmake += """ +set( + """ + name + """ + """ + "\n ".join(['${PROJECT_SOURCE_DIR}/' + searchpath + '/' + f for f in filenames]) + """ +) -filenames = [] -for root, dirnames, files in os.walk(path, path): - for mask in masks: - for filename in fnmatch.filter(files, mask): - filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/')) + """ cmake = """ # Auto-generated file. 
Do not edit # Use update-sources.py - -set( - KFR_SRC - """ + "\n ".join(['${PROJECT_SOURCE_DIR}/include/' + f for f in filenames]) + """ -) """ +list_sources("KFR_SRC", "include", ['*.hpp', '*.h', '*.i', '*.inc']) +list_sources("KFR_DFT_SRC", "include/kfr/dft", ['*.cpp']) +list_sources("KFR_IO_SRC", "include/kfr/io", ['*.cpp']) + +list_sources("KFR_UNITTEST_SRC", "tests/unit", ['*.cpp']) + with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sources.cmake'), "w") as f: f.write(cmake)